author      neel <neel@FreeBSD.org>    2013-01-19 04:18:52 +0000
committer   neel <neel@FreeBSD.org>    2013-01-19 04:18:52 +0000
commit      363335d53e3c955602378aa434d0054c48a6e0d6 (patch)
tree        5af6fe77acd4da3002c907484fd64133deb95c8d
parent      3600c83b820e00959d61600e67e9dcb32ef6b518 (diff)
parent      dde8bf641fc7c8e9541167cd7c01523973d0b569 (diff)
download    FreeBSD-src-363335d53e3c955602378aa434d0054c48a6e0d6.zip
            FreeBSD-src-363335d53e3c955602378aa434d0054c48a6e0d6.tar.gz
Merge projects/bhyve to head.
'bhyve' was developed by grehan@ and myself at NetApp (thanks!).

Special thanks to Peter Snyder, Joe Caradonna and Michael Dexter for their
support and encouragement.

Obtained from:  NetApp
-rw-r--r--  lib/Makefile | 5
-rw-r--r--  lib/libvmmapi/Makefile | 11
-rw-r--r--  lib/libvmmapi/vmmapi.c | 723
-rw-r--r--  lib/libvmmapi/vmmapi.h | 105
-rw-r--r--  lib/libvmmapi/vmmapi_freebsd.c | 183
-rw-r--r--  share/man/man4/bhyve.4 | 68
-rw-r--r--  share/mk/bsd.libnames.mk | 1
-rw-r--r--  sys/amd64/include/vmm.h | 293
-rw-r--r--  sys/amd64/include/vmm_dev.h | 215
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h | 113
-rw-r--r--  sys/amd64/vmm/amd/amdv.c | 265
-rw-r--r--  sys/amd64/vmm/intel/ept.c | 392
-rw-r--r--  sys/amd64/vmm/intel/ept.h | 43
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c | 551
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h | 338
-rw-r--r--  sys/amd64/vmm/intel/vmx.c | 1845
-rw-r--r--  sys/amd64/vmm/intel/vmx.h | 120
-rw-r--r--  sys/amd64/vmm/intel/vmx_controls.h | 92
-rw-r--r--  sys/amd64/vmm/intel/vmx_cpufunc.h | 218
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c | 89
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c | 172
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h | 78
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S | 246
-rw-r--r--  sys/amd64/vmm/intel/vtd.c | 677
-rw-r--r--  sys/amd64/vmm/io/iommu.c | 277
-rw-r--r--  sys/amd64/vmm/io/iommu.h | 75
-rw-r--r--  sys/amd64/vmm/io/ppt.c | 610
-rw-r--r--  sys/amd64/vmm/io/ppt.h | 41
-rw-r--r--  sys/amd64/vmm/io/vdev.c | 270
-rw-r--r--  sys/amd64/vmm/io/vdev.h | 84
-rw-r--r--  sys/amd64/vmm/io/vlapic.c | 901
-rw-r--r--  sys/amd64/vmm/io/vlapic.h | 111
-rw-r--r--  sys/amd64/vmm/vmm.c | 1022
-rw-r--r--  sys/amd64/vmm/vmm_dev.c | 538
-rw-r--r--  sys/amd64/vmm/vmm_host.c | 124
-rw-r--r--  sys/amd64/vmm/vmm_host.h | 75
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c | 810
-rw-r--r--  sys/amd64/vmm/vmm_ipi.c | 93
-rw-r--r--  sys/amd64/vmm/vmm_ipi.h | 39
-rw-r--r--  sys/amd64/vmm/vmm_ktr.h | 51
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c | 201
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h | 71
-rw-r--r--  sys/amd64/vmm/vmm_mem.c | 135
-rw-r--r--  sys/amd64/vmm/vmm_mem.h | 37
-rw-r--r--  sys/amd64/vmm/vmm_msr.c | 254
-rw-r--r--  sys/amd64/vmm/vmm_msr.h | 43
-rw-r--r--  sys/amd64/vmm/vmm_stat.c | 104
-rw-r--r--  sys/amd64/vmm/vmm_stat.h | 71
-rw-r--r--  sys/amd64/vmm/vmm_support.S | 42
-rw-r--r--  sys/amd64/vmm/vmm_util.c | 111
-rw-r--r--  sys/amd64/vmm/vmm_util.h | 40
-rw-r--r--  sys/amd64/vmm/x86.c | 202
-rw-r--r--  sys/amd64/vmm/x86.h | 64
-rw-r--r--  sys/conf/files.amd64 | 5
-rw-r--r--  sys/dev/blackhole/blackhole.c | 129
-rw-r--r--  sys/dev/bvm/bvm_console.c | 240
-rw-r--r--  sys/dev/bvm/bvm_dbg.c | 100
-rw-r--r--  sys/modules/Makefile | 4
-rw-r--r--  sys/modules/blackhole/Makefile | 9
-rw-r--r--  sys/modules/vmm/Makefile | 62
-rw-r--r--  usr.sbin/Makefile.amd64 | 3
-rw-r--r--  usr.sbin/bhyve/Makefile | 27
-rw-r--r--  usr.sbin/bhyve/acpi.c | 844
-rw-r--r--  usr.sbin/bhyve/acpi.h | 34
-rw-r--r--  usr.sbin/bhyve/atpic.c | 68
-rw-r--r--  usr.sbin/bhyve/bhyverun.c | 788
-rw-r--r--  usr.sbin/bhyve/bhyverun.h | 53
-rw-r--r--  usr.sbin/bhyve/consport.c | 140
-rw-r--r--  usr.sbin/bhyve/dbgport.c | 138
-rw-r--r--  usr.sbin/bhyve/dbgport.h | 36
-rw-r--r--  usr.sbin/bhyve/elcr.c | 65
-rw-r--r--  usr.sbin/bhyve/inout.c | 151
-rw-r--r--  usr.sbin/bhyve/inout.h | 67
-rw-r--r--  usr.sbin/bhyve/ioapic.c | 324
-rw-r--r--  usr.sbin/bhyve/ioapic.h | 38
-rw-r--r--  usr.sbin/bhyve/mem.c | 218
-rw-r--r--  usr.sbin/bhyve/mem.h | 57
-rw-r--r--  usr.sbin/bhyve/mevent.c | 432
-rw-r--r--  usr.sbin/bhyve/mevent.h | 49
-rw-r--r--  usr.sbin/bhyve/mevent_test.c | 180
-rw-r--r--  usr.sbin/bhyve/mptbl.c | 398
-rw-r--r--  usr.sbin/bhyve/mptbl.h | 35
-rw-r--r--  usr.sbin/bhyve/pci_emul.c | 1117
-rw-r--r--  usr.sbin/bhyve/pci_emul.h | 216
-rw-r--r--  usr.sbin/bhyve/pci_hostbridge.c | 52
-rw-r--r--  usr.sbin/bhyve/pci_passthru.c | 724
-rw-r--r--  usr.sbin/bhyve/pci_uart.c | 626
-rw-r--r--  usr.sbin/bhyve/pci_virtio_block.c | 534
-rw-r--r--  usr.sbin/bhyve/pci_virtio_net.c | 781
-rw-r--r--  usr.sbin/bhyve/pit_8254.c | 198
-rw-r--r--  usr.sbin/bhyve/pit_8254.h | 45
-rw-r--r--  usr.sbin/bhyve/pmtmr.c | 108
-rw-r--r--  usr.sbin/bhyve/post.c | 51
-rw-r--r--  usr.sbin/bhyve/rtc.c | 274
-rw-r--r--  usr.sbin/bhyve/spinup_ap.c | 119
-rw-r--r--  usr.sbin/bhyve/spinup_ap.h | 34
-rw-r--r--  usr.sbin/bhyve/uart.c | 60
-rw-r--r--  usr.sbin/bhyve/virtio.h | 85
-rw-r--r--  usr.sbin/bhyve/xmsr.c | 48
-rw-r--r--  usr.sbin/bhyve/xmsr.h | 34
-rw-r--r--  usr.sbin/bhyvectl/Makefile | 17
-rw-r--r--  usr.sbin/bhyvectl/bhyvectl.c | 1524
-rw-r--r--  usr.sbin/bhyveload/Makefile | 14
-rw-r--r--  usr.sbin/bhyveload/bhyveload.8 | 130
-rw-r--r--  usr.sbin/bhyveload/bhyveload.c | 652
105 files changed, 25476 insertions, 0 deletions
diff --git a/lib/Makefile b/lib/Makefile
index 3dd274e..132302e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -115,6 +115,7 @@ SUBDIR= ${SUBDIR_ORDERED} \
${_libusbhid} \
${_libusb} \
${_libvgl} \
+ ${_libvmmapi} \
libwrap \
liby \
libz \
@@ -198,6 +199,10 @@ _libproc= libproc
_librtld_db= librtld_db
.endif
+.if ${MACHINE_CPUARCH} == "amd64"
+_libvmmapi= libvmmapi
+.endif
+
.if ${MACHINE_CPUARCH} == "ia64"
_libefi= libefi
.endif
diff --git a/lib/libvmmapi/Makefile b/lib/libvmmapi/Makefile
new file mode 100644
index 0000000..93d3c85
--- /dev/null
+++ b/lib/libvmmapi/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+LIB= vmmapi
+SRCS= vmmapi.c vmmapi_freebsd.c
+INCS= vmmapi.h
+
+WARNS?= 2
+
+CFLAGS+= -I${.CURDIR}
+
+.include <bsd.lib.mk>
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
new file mode 100644
index 0000000..cfb42d0
--- /dev/null
+++ b/lib/libvmmapi/vmmapi.c
@@ -0,0 +1,723 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <machine/specialreg.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmmapi.h"
+
+struct vmctx {
+ int fd;
+ char *name;
+};
+
+#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
+#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
+
+static int
+vm_device_open(const char *name)
+{
+ int fd, len;
+ char *vmfile;
+
+ len = strlen("/dev/vmm/") + strlen(name) + 1;
+ vmfile = malloc(len);
+ assert(vmfile != NULL);
+ snprintf(vmfile, len, "/dev/vmm/%s", name);
+
+ /* Open the device file */
+ fd = open(vmfile, O_RDWR, 0);
+
+ free(vmfile);
+ return (fd);
+}
+
+int
+vm_create(const char *name)
+{
+
+ return (CREATE((char *)name));
+}
+
+struct vmctx *
+vm_open(const char *name)
+{
+ struct vmctx *vm;
+
+ vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
+ assert(vm != NULL);
+
+ vm->fd = -1;
+ vm->name = (char *)(vm + 1);
+ strcpy(vm->name, name);
+
+ if ((vm->fd = vm_device_open(vm->name)) < 0)
+ goto err;
+
+ return (vm);
+err:
+ vm_destroy(vm);
+ return (NULL);
+}
+
+void
+vm_destroy(struct vmctx *vm)
+{
+ assert(vm != NULL);
+
+ if (vm->fd >= 0)
+ close(vm->fd);
+ DESTROY(vm->name);
+
+ free(vm);
+}
+
+size_t
+vmm_get_mem_total(void)
+{
+ size_t mem_total = 0;
+ size_t oldlen = sizeof(mem_total);
+ int error;
+ error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0);
+ if (error)
+ return -1;
+ return mem_total;
+}
+
+size_t
+vmm_get_mem_free(void)
+{
+ size_t mem_free = 0;
+ size_t oldlen = sizeof(mem_free);
+ int error;
+ error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0);
+ if (error)
+ return -1;
+ return mem_free;
+}
+
+int
+vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
+ *ret_len = seg.len;
+ return (error);
+}
+
+int
+vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **mapaddr)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ /*
+ * Create and optionally map 'len' bytes of memory at guest
+ * physical address 'gpa'
+ */
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ seg.len = len;
+ error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
+ if (error == 0 && mapaddr != NULL) {
+ *mapaddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa);
+ }
+ return (error);
+}
+
+char *
+vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
+{
+
+ /* Map 'len' bytes of memory at guest physical address 'gpa' */
+ return ((char *)mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa));
+}
+
+int
+vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+ vmsegdesc.desc.base = base;
+ vmsegdesc.desc.limit = limit;
+ vmsegdesc.desc.access = access;
+
+ error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ return (error);
+}
+
+int
+vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ if (error == 0) {
+ *base = vmsegdesc.desc.base;
+ *limit = vmsegdesc.desc.limit;
+ *access = vmsegdesc.desc.access;
+ }
+ return (error);
+}
+
+int
+vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+ vmreg.regval = val;
+
+ error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
+ return (error);
+}
+
+int
+vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
+ *ret_val = vmreg.regval;
+ return (error);
+}
+
+int
+vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid)
+{
+ int error;
+ struct vm_pin vmpin;
+
+ bzero(&vmpin, sizeof(vmpin));
+ vmpin.vm_cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_GET_PINNING, &vmpin);
+ *host_cpuid = vmpin.host_cpuid;
+ return (error);
+}
+
+int
+vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid)
+{
+ int error;
+ struct vm_pin vmpin;
+
+ bzero(&vmpin, sizeof(vmpin));
+ vmpin.vm_cpuid = vcpu;
+ vmpin.host_cpuid = host_cpuid;
+
+ error = ioctl(ctx->fd, VM_SET_PINNING, &vmpin);
+ return (error);
+}
+
+int
+vm_run(struct vmctx *ctx, int vcpu, uint64_t rip, struct vm_exit *vmexit)
+{
+ int error;
+ struct vm_run vmrun;
+
+ bzero(&vmrun, sizeof(vmrun));
+ vmrun.cpuid = vcpu;
+ vmrun.rip = rip;
+
+ error = ioctl(ctx->fd, VM_RUN, &vmrun);
+ bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
+ return (error);
+}
+
+static int
+vm_inject_event_real(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code, int error_code_valid)
+{
+ struct vm_event ev;
+
+ bzero(&ev, sizeof(ev));
+ ev.cpuid = vcpu;
+ ev.type = type;
+ ev.vector = vector;
+ ev.error_code = error_code;
+ ev.error_code_valid = error_code_valid;
+
+ return (ioctl(ctx->fd, VM_INJECT_EVENT, &ev));
+}
+
+int
+vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector)
+{
+
+ return (vm_inject_event_real(ctx, vcpu, type, vector, 0, 0));
+}
+
+int
+vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code)
+{
+
+ return (vm_inject_event_real(ctx, vcpu, type, vector, error_code, 1));
+}
+
+int
+vm_apicid2vcpu(struct vmctx *ctx, int apicid)
+{
+ /*
+ * The apic id associated with the 'vcpu' has the same numerical value
+ * as the 'vcpu' itself.
+ */
+ return (apicid);
+}
+
+int
+vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
+{
+ struct vm_lapic_irq vmirq;
+
+ bzero(&vmirq, sizeof(vmirq));
+ vmirq.cpuid = vcpu;
+ vmirq.vector = vector;
+
+ return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
+}
+
+int
+vm_inject_nmi(struct vmctx *ctx, int vcpu)
+{
+ struct vm_nmi vmnmi;
+
+ bzero(&vmnmi, sizeof(vmnmi));
+ vmnmi.cpuid = vcpu;
+
+ return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
+}
+
+static struct {
+ const char *name;
+ int type;
+} capstrmap[] = {
+ { "hlt_exit", VM_CAP_HALT_EXIT },
+ { "mtrap_exit", VM_CAP_MTRAP_EXIT },
+ { "pause_exit", VM_CAP_PAUSE_EXIT },
+ { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
+ { 0 }
+};
+
+int
+vm_capability_name2type(const char *capname)
+{
+ int i;
+
+ for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
+ if (strcmp(capstrmap[i].name, capname) == 0)
+ return (capstrmap[i].type);
+ }
+
+ return (-1);
+}
+
+const char *
+vm_capability_type2name(int type)
+{
+ int i;
+
+ for (i = 0; capstrmap[i].name != NULL; i++) {
+ if (capstrmap[i].type == type)
+ return (capstrmap[i].name);
+ }
+
+ return (NULL);
+}
+
+int
+vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval)
+{
+ int error;
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+
+ error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
+ *retval = vmcap.capval;
+ return (error);
+}
+
+int
+vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
+{
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+ vmcap.capval = val;
+
+ return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
+}
+
+int
+vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
+}
+
+int
+vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
+}
+
+int
+vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ struct vm_pptdev_mmio pptmmio;
+
+ bzero(&pptmmio, sizeof(pptmmio));
+ pptmmio.bus = bus;
+ pptmmio.slot = slot;
+ pptmmio.func = func;
+ pptmmio.gpa = gpa;
+ pptmmio.len = len;
+ pptmmio.hpa = hpa;
+
+ return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
+}
+
+int
+vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ struct vm_pptdev_msi pptmsi;
+
+ bzero(&pptmsi, sizeof(pptmsi));
+ pptmsi.vcpu = vcpu;
+ pptmsi.bus = bus;
+ pptmsi.slot = slot;
+ pptmsi.func = func;
+ pptmsi.destcpu = destcpu;
+ pptmsi.vector = vector;
+ pptmsi.numvec = numvec;
+
+ return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
+}
+
+int
+vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct vm_pptdev_msix pptmsix;
+
+ bzero(&pptmsix, sizeof(pptmsix));
+ pptmsix.vcpu = vcpu;
+ pptmsix.bus = bus;
+ pptmsix.slot = slot;
+ pptmsix.func = func;
+ pptmsix.idx = idx;
+ pptmsix.msg = msg;
+ pptmsix.addr = addr;
+ pptmsix.vector_control = vector_control;
+
+ return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
+}
+
+uint64_t *
+vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries)
+{
+ int error;
+
+ static struct vm_stats vmstats;
+
+ vmstats.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_STATS, &vmstats);
+ if (error == 0) {
+ if (ret_entries)
+ *ret_entries = vmstats.num_entries;
+ if (ret_tv)
+ *ret_tv = vmstats.tv;
+ return (vmstats.statbuf);
+ } else
+ return (NULL);
+}
+
+const char *
+vm_get_stat_desc(struct vmctx *ctx, int index)
+{
+ static struct vm_stat_desc statdesc;
+
+ statdesc.index = index;
+ if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
+ return (statdesc.desc);
+ else
+ return (NULL);
+}
+
+int
+vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state)
+{
+ int error;
+ struct vm_x2apic x2apic;
+
+ bzero(&x2apic, sizeof(x2apic));
+ x2apic.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
+ *state = x2apic.state;
+ return (error);
+}
+
+int
+vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
+{
+ int error;
+ struct vm_x2apic x2apic;
+
+ bzero(&x2apic, sizeof(x2apic));
+ x2apic.cpuid = vcpu;
+ x2apic.state = state;
+
+ error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);
+
+ return (error);
+}
+
+/*
+ * From Intel Vol 3a:
+ * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
+ */
+int
+vcpu_reset(struct vmctx *vmctx, int vcpu)
+{
+ int error;
+ uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
+ uint32_t desc_access, desc_limit;
+ uint16_t sel;
+
+ zero = 0;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ rip = 0xfff0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ cr0 = CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
+ goto done;
+
+ cr4 = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ /*
+ * CS: present, r/w, accessed, 16-bit, byte granularity, usable
+ */
+ desc_base = 0xffff0000;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0xf000;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
+ goto done;
+
+ /*
+ * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
+ */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
+ goto done;
+
+ /* General purpose registers */
+ rdx = 0xf00;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
+ goto done;
+
+ /* GDTR, IDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ /* TR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0000008b;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
+ goto done;
+
+ /* LDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x00000082;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
+ desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
+ goto done;
+
+ /* XXX cr2, debug registers */
+
+ error = 0;
+done:
+ return (error);
+}
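
The library calls implemented above compose into a short lifecycle: create the VM, open its device node, give it guest memory, reset the vcpu, then loop on vm_run(). A minimal sketch of such a client follows; it is not part of this commit, the VM name, memory size, and entry point are assumptions, and a real consumer (see usr.sbin/bhyve/bhyverun.c in this merge) does far more work per exit.

    #include <sys/types.h>
    #include <stdlib.h>

    #include <machine/vmm.h>

    #include "vmmapi.h"

    int
    main(void)
    {
            struct vmctx *ctx;
            struct vm_exit vmexit;
            char *membase;
            uint64_t rip = 0x100000;        /* assumed guest entry point */
            int error, vcpu = 0;

            /* Create the in-kernel VM and open its /dev/vmm/<name> node. */
            if (vm_create("sketchvm") != 0)
                    exit(1);
            if ((ctx = vm_open("sketchvm")) == NULL)
                    exit(1);

            /* Back guest-physical [0, 16MB) and map it into this process. */
            if (vm_setup_memory(ctx, 0, 16 * 1024 * 1024, &membase) != 0)
                    exit(1);
            (void)membase;  /* a real loader copies the guest image here */

            /* Establish power-on register state for the vcpu. */
            error = vcpu_reset(ctx, vcpu);

            /* Run the vcpu, resuming after each exit we can ignore. */
            while (error == 0) {
                    error = vm_run(ctx, vcpu, rip, &vmexit);
                    if (error != 0 || vmexit.exitcode == VM_EXITCODE_HLT)
                            break;
                    rip = vmexit.rip + vmexit.inst_length;
            }

            vm_destroy(ctx);
            return (0);
    }
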
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
new file mode 100644
index 0000000..de04252
--- /dev/null
+++ b/lib/libvmmapi/vmmapi.h
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMMAPI_H_
+#define _VMMAPI_H_
+
+struct vmctx;
+enum x2apic_state;
+
+int vm_create(const char *name);
+struct vmctx *vm_open(const char *name);
+void vm_destroy(struct vmctx *ctx);
+size_t vmm_get_mem_total(void);
+size_t vmm_get_mem_free(void);
+int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
+/*
+ * Create a memory segment of 'len' bytes in the guest physical address space
+ * at offset 'gpa'.
+ *
+ * If 'mapaddr' is not NULL then this region is mmap'ed into the address
+ * space of the calling process. If there is an mmap error then *mapaddr
+ * will be set to MAP_FAILED.
+ */
+
+int vm_setup_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len,
+ char **mapaddr);
+char * vm_map_memory(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
+int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access);
+int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access);
+int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
+int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
+int vm_get_pinning(struct vmctx *ctx, int vcpu, int *host_cpuid);
+int vm_set_pinning(struct vmctx *ctx, int vcpu, int host_cpuid);
+int vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
+ struct vm_exit *ret_vmexit);
+int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
+int vm_inject_event(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector);
+int vm_inject_event2(struct vmctx *ctx, int vcpu, enum vm_event_type type,
+ int vector, int error_code);
+int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
+int vm_inject_nmi(struct vmctx *ctx, int vcpu);
+int vm_capability_name2type(const char *capname);
+const char *vm_capability_type2name(int type);
+int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval);
+int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int val);
+int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int dest, int vector, int numvec);
+int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+
+/*
+ * Return a pointer to the statistics buffer. Note that this is not MT-safe.
+ */
+uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries);
+const char *vm_get_stat_desc(struct vmctx *ctx, int index);
+
+int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
+int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
+
+/* Reset vcpu register state */
+int vcpu_reset(struct vmctx *ctx, int vcpu);
+
+/*
+ * FreeBSD specific APIs
+ */
+int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp);
+void vm_setup_freebsd_gdt(uint64_t *gdtr);
+#endif /* _VMMAPI_H_ */
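
The capability helpers declared above are meant to be chained: translate a user-supplied name to a type, set it, and read it back. A hedged sketch, with the vcpu number and error handling chosen for brevity:

    /* Enable an optional capability by name on vcpu 0; 0 on success. */
    static int
    enable_cap_byname(struct vmctx *ctx, const char *capname)
    {
            int type, val;

            type = vm_capability_name2type(capname);  /* e.g. "hlt_exit" */
            if (type < 0)
                    return (-1);                      /* unknown capability */
            if (vm_set_capability(ctx, 0, type, 1) != 0)
                    return (-1);
            /* Read it back to confirm the kernel accepted the toggle. */
            if (vm_get_capability(ctx, 0, type, &val) != 0 || val == 0)
                    return (-1);
            return (0);
    }
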
diff --git a/lib/libvmmapi/vmmapi_freebsd.c b/lib/libvmmapi/vmmapi_freebsd.c
new file mode 100644
index 0000000..9bd2988
--- /dev/null
+++ b/lib/libvmmapi/vmmapi_freebsd.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/specialreg.h>
+#include <machine/segments.h>
+#include <machine/vmm.h>
+
+#include "vmmapi.h"
+
+#define DESC_UNUSABLE 0x00010000
+
+#define GUEST_NULL_SEL 0
+#define GUEST_CODE_SEL 1
+#define GUEST_DATA_SEL 2
+#define GUEST_GDTR_LIMIT (3 * 8 - 1)
+
+void
+vm_setup_freebsd_gdt(uint64_t *gdtr)
+{
+ gdtr[GUEST_NULL_SEL] = 0;
+ gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
+ gdtr[GUEST_DATA_SEL] = 0x0000900000000000;
+}
+
+/*
+ * Setup the 'vcpu' register set such that it will begin execution at
+ * 'rip' in long mode.
+ */
+int
+vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp)
+{
+ int error;
+ uint64_t cr0, cr4, efer, rflags, desc_base;
+ uint32_t desc_access, desc_limit;
+ uint16_t gsel;
+
+ cr0 = CR0_PE | CR0_PG | CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = CR4_PAE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ efer = EFER_LME | EFER_LMA;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)))
+ goto done;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ desc_base = 0;
+ desc_limit = 0;
+ desc_access = 0x0000209B;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ desc_access = 0x00000093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ /*
+ * XXX TR is pointing to null selector even though we set the
+ * TSS segment to be usable with a base address and limit of 0.
+ */
+ desc_access = 0x0000008b;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
+ DESC_UNUSABLE);
+ if (error)
+ goto done;
+
+ gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
+ goto done;
+
+ gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
+ goto done;
+
+ /* XXX TR is pointing to the null selector */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
+ goto done;
+
+ /* LDTR is pointing to the null selector */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
+ goto done;
+
+ /* entry point */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ /* page table base */
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
+ goto done;
+
+ desc_base = gdtbase;
+ desc_limit = GUEST_GDTR_LIMIT;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ if (error != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ return (error);
+}
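
Combined with vm_setup_memory() from vmmapi.c, the two helpers above are all that is needed to aim a vcpu at a 64-bit FreeBSD entry point. A sketch under stated assumptions: 'membase' is the host mapping of guest-physical address 0, the GDT location is an arbitrary illustrative choice, and bhyveload is the real consumer of this pattern.

    static int
    start_freebsd_vcpu(struct vmctx *ctx, char *membase, uint64_t rip,
        uint64_t cr3, uint64_t rsp)
    {
            uint64_t gdtbase = 0x5000;      /* illustrative guest-physical addr */

            /* Write the null/code/data descriptors into guest memory. */
            vm_setup_freebsd_gdt((uint64_t *)(membase + gdtbase));

            /* Long-mode register state: vcpu 0 starts executing at 'rip'. */
            return (vm_setup_freebsd_registers(ctx, 0, rip, cr3, gdtbase, rsp));
    }
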
diff --git a/share/man/man4/bhyve.4 b/share/man/man4/bhyve.4
new file mode 100644
index 0000000..cdfc1e2
--- /dev/null
+++ b/share/man/man4/bhyve.4
@@ -0,0 +1,68 @@
+.\"
+.\" Copyright (c) 2012 NetApp Inc
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 5, 2013
+.Dt BHYVE 4
+.Os
+.Sh NAME
+.Nm bhyve
+.Nd virtual machine monitor
+.Sh SYNOPSIS
+.Cd "/usr/sbin/bhyve"
+.Cd "/usr/sbin/bhyveload"
+.Cd "/usr/sbin/bhyvectl"
+.Cd "/boot/kernel/vmm.ko"
+.Sh DESCRIPTION
+.Nm
+is a virtual machine monitor that is hosted by FreeBSD.
+It is used to host unmodified guest operating systems on top of FreeBSD.
+.Pp
+.Nm
+relies heavily on hardware assist provided by the CPU and chipset to virtualize
+processor and memory resources.
+.Sh SEE ALSO
+.Xr bhyve 8 ,
+.Xr bhyveload 8 ,
+.Xr bhyvectl 8 ,
+.Xr vmm 4
+.Sh HISTORY
+.Nm
+first appeared in
+.Fx 10.0 ,
+and was developed at NetApp Inc.
+.Sh AUTHORS
+.Nm
+was developed by
+.An -nosplit
+.An "Peter Grehan" Aq grehan@FreeBSD.org
+and
+.An "Neel Natu" Aq neel@FreeBSD.org
+at NetApp Inc.
+.Sh BUGS
+.Nm
+is considered experimental in
+.Fx .
diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk
index 4f8bedd..95f9064 100644
--- a/share/mk/bsd.libnames.mk
+++ b/share/mk/bsd.libnames.mk
@@ -162,6 +162,7 @@ LIBULOG?= ${DESTDIR}${LIBDIR}/libulog.a
LIBUTIL?= ${DESTDIR}${LIBDIR}/libutil.a
LIBUUTIL?= ${DESTDIR}${LIBDIR}/libuutil.a
LIBVGL?= ${DESTDIR}${LIBDIR}/libvgl.a
+LIBVMMAPI?= ${DESTDIR}${LIBDIR}/libvmmapi.a
LIBWIND?= ${DESTDIR}${LIBDIR}/libwind.a
LIBWRAP?= ${DESTDIR}${LIBDIR}/libwrap.a
LIBXPG4?= ${DESTDIR}${LIBDIR}/libxpg4.a
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
new file mode 100644
index 0000000..024c30e
--- /dev/null
+++ b/sys/amd64/include/vmm.h
@@ -0,0 +1,293 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#ifdef _KERNEL
+
+#define VM_MAX_NAMELEN 32
+
+struct vm;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vlapic;
+
+enum x2apic_state;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
+ vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot,
+ boolean_t superpages_ok);
+typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
+ int type, int vector,
+ uint32_t code, int code_valid);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_set_func_t vmmmap_set;
+ vmi_mmap_get_func_t vmmmap_get;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_inject_event_t vminject;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+struct vm *vm_create(const char *name);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
+int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_event(struct vm *vm, int vcpu, int type,
+ int vector, uint32_t error_code, int error_code_valid);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+void vm_activate_cpu(struct vm *vm, int vcpu);
+cpuset_t vm_active_cpus(struct vm *vm);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_RUNNING,
+ VCPU_CANNOT_RUN,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu);
+
+static int __inline
+vcpu_is_running(struct vm *vm, int vcpu)
+{
+ return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING);
+}
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+
+#endif /* KERNEL */
+
+#include <machine/vmm_instruction_emul.h>
+
+#define VM_MAXCPU 8 /* maximum virtual cpus */
+
+/*
+ * Identifiers for events that can be injected into the VM
+ */
+enum vm_event_type {
+ VM_EVENT_NONE,
+ VM_HW_INTR,
+ VM_NMI,
+ VM_HW_EXCEPTION,
+ VM_SW_INTR,
+ VM_PRIV_SW_EXCEPTION,
+ VM_SW_EXCEPTION,
+ VM_EVENT_MAX
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_LAST
+};
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_MAX
+};
+
+enum x2apic_state {
+ X2APIC_ENABLED,
+ X2APIC_AVAILABLE,
+ X2APIC_DISABLED,
+ X2APIC_STATE_LAST
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_MAX
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1; /* out is 0, in is 1 */
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+ } inout;
+ struct {
+ uint64_t gpa;
+ struct vie vie;
+ } paging;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int error; /* vmx inst error */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ } u;
+};
+
+#endif /* _VMM_H_ */
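
The union inside struct vm_exit is discriminated by 'exitcode', so a userspace handler reduces to a switch over enum vm_exitcode. An illustrative skeleton (the full dispatcher added by this commit is the vmexit handling in usr.sbin/bhyve/bhyverun.c):

    static int
    handle_vmexit(struct vm_exit *vme)
    {
            switch (vme->exitcode) {
            case VM_EXITCODE_INOUT:
                    /* vme->u.inout: port, bytes, in/out, rep, string */
                    return (0);
            case VM_EXITCODE_RDMSR:
            case VM_EXITCODE_WRMSR:
                    /* vme->u.msr.code is the MSR number (guest %ecx) */
                    return (0);
            case VM_EXITCODE_PAGING:
                    /* vme->u.paging.gpa faulted; 'vie' is the decoded insn */
                    return (0);
            case VM_EXITCODE_HLT:
                    return (1);     /* guest executed HLT */
            default:
                    return (-1);    /* bogus or unhandled exit */
            }
    }
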
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
new file mode 100644
index 0000000..79f893d
--- /dev/null
+++ b/sys/amd64/include/vmm_dev.h
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+int vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t gpa; /* in */
+ size_t len; /* in */
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_pin {
+ int vm_cpuid;
+ int host_cpuid; /* -1 to unpin */
+};
+
+struct vm_run {
+ int cpuid;
+ uint64_t rip; /* start running here */
+ struct vm_exit vm_exit;
+};
+
+struct vm_event {
+ int cpuid;
+ enum vm_event_type type;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ int vector;
+ int destcpu;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint32_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+enum {
+ IOCNUM_RUN,
+ IOCNUM_SET_PINNING,
+ IOCNUM_GET_PINNING,
+ IOCNUM_MAP_MEMORY,
+ IOCNUM_GET_MEMORY_SEG,
+ IOCNUM_SET_REGISTER,
+ IOCNUM_GET_REGISTER,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR,
+ IOCNUM_INJECT_EVENT,
+ IOCNUM_LAPIC_IRQ,
+ IOCNUM_SET_CAPABILITY,
+ IOCNUM_GET_CAPABILITY,
+ IOCNUM_BIND_PPTDEV,
+ IOCNUM_UNBIND_PPTDEV,
+ IOCNUM_MAP_PPTDEV_MMIO,
+ IOCNUM_PPTDEV_MSI,
+ IOCNUM_PPTDEV_MSIX,
+ IOCNUM_INJECT_NMI,
+ IOCNUM_VM_STATS,
+ IOCNUM_VM_STAT_DESC,
+ IOCNUM_SET_X2APIC_STATE,
+ IOCNUM_GET_X2APIC_STATE,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_SET_PINNING \
+ _IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
+#define VM_GET_PINNING \
+ _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EVENT \
+ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#endif
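
Everything libvmmapi does funnels through these ioctls, so they can also be driven directly. A hedged sketch that reads vcpu 0's %rip straight from the device node (the VM name is an assumption):

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <unistd.h>

    #include <machine/vmm.h>
    #include <machine/vmm_dev.h>

    static int
    read_guest_rip(uint64_t *rip)
    {
            struct vm_register vmreg = {
                    .cpuid = 0,
                    .regnum = VM_REG_GUEST_RIP,
            };
            int error, fd;

            if ((fd = open("/dev/vmm/sketchvm", O_RDWR)) < 0)
                    return (-1);
            error = ioctl(fd, VM_GET_REGISTER, &vmreg);
            if (error == 0)
                    *rip = vmreg.regval;
            close(fd);
            return (error);
    }
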
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..4cc494b
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
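
The callback pair lets vmm_emulate_instruction() complete a faulting access without knowing anything about the device behind it. A sketch backing one hypothetical 4-byte MMIO register; the address, names, and semantics are illustrative:

    #include <errno.h>

    #define MMIO_REG_GPA    0xd0000000UL    /* hypothetical device register */

    static uint32_t mmio_reg;               /* device state behind the region */

    static int
    mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize,
        void *arg)
    {
            if (gpa != MMIO_REG_GPA || rsize != 4)
                    return (EINVAL);
            *rval = mmio_reg;
            return (0);
    }

    static int
    mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize,
        void *arg)
    {
            if (gpa != MMIO_REG_GPA || wsize != 4)
                    return (EINVAL);
            mmio_reg = (uint32_t)wval;
            return (0);
    }

Once a paging exit has been fetched and decoded into a 'vie', the two functions are passed as the 'mrr' and 'mrw' arguments, with any per-device state carried through 'mrarg'.
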
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
new file mode 100644
index 0000000..dc071d3
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -0,0 +1,265 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "io/iommu.h"
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+ return;
+}
+
+static int
+amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap_set: not implemented\n");
+ return (EINVAL);
+}
+
+static vm_paddr_t
+amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+
+ printf("amdv_vmmmap_get: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_inject_event(void *vmi, int vcpu, int type, int vector,
+ uint32_t error_code, int error_code_valid)
+{
+
+ printf("amdv_inject_event: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap_set,
+ amdv_vmmmap_get,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_inject_event,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static uint64_t
+amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ printf("amd_iommu_remove_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+static void
+amd_iommu_invalidate_tlb(void *domain)
+{
+
+ printf("amd_iommu_invalidate_tlb: not implemented\n");
+}
+
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_remove_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+ amd_iommu_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 0000000..4f91601
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0)
+ return;
+
+ tabs = 3 - nlevels;
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) {
+ ptpval = ptp[i];
+
+ if (ptpval == 0)
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - superpage sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'length'
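+ *
+ * For example, with 2MB superpages supported and gpa = hpa =
+ * 0x40200000 with length = 4MB, the larger candidates fail the
+ * size-mask or alignment checks but the 2MB candidate passes all
+ * three, so the loop below settles on spshift = 21 (a 2MB leaf).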
+ */
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+ * XXX should we enforce this memory type by setting the
+ * 'ignore PAT' bit to 1?
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+ pgmask = (1UL << ptpshift) - 1;
+ hpabase = ptpval & ~pgmask;
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ }
+
+ return ((vm_paddr_t)-1);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg;
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 0000000..2d7258d
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
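+/*
+ * The low EPTP bits encode properties of the walk: bits 2:0 hold the
+ * EPT paging-structure memory type (PAT_WRITE_BACK == 6) and bits 5:3
+ * hold the page-walk length minus one, hence (EPT_PWLEVELS - 1) above.
+ */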
+
+int ept_init(void);
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 0000000..a5784dd
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,551 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+ * sign bit. As it so happens, the upper 16 bits are reserved (i.e.
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
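+ *
+ * For example, VMCS_IDENT(VMCS_GUEST_ACTIVITY) yields 0x80004826,
+ * which is negative when viewed as an 'int'; masking off the sign
+ * bit below recovers the raw encoding 0x4826.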
+ */
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ VMPTRLD(vmcs);
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ VMPTRLD(vmcs);
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat();
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE;
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0)
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
+
+#ifdef DDB
+extern int vmxon_enabled[];
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit;
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) {
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs);
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) {
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000)
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) {
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) {
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff);
+ if (val & 0x800)
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 0000000..f39eed2
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,338 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+
+#endif /* _KERNEL */
+
+#define VMCS_INITIAL 0xffffffffffffffff
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define VMCS_HOST_RIP 0x00006c16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 0000000..4f267bb
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1845 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER)
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
+ VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+static int vmx_no_patmsr;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMI to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
+static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
+static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted");
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC:
+ return "apic";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ case VMX_RETURN_AST:
+ return "ast";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static void __inline
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+ * VMXON or VMXOFF are not required to invalidate any TLB
+ * caching structures, so invalidate all EPT and VPID mappings
+ * explicitly. This prevents potential retention of cached
+ * information in the TLB between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & 0x5) != 0x5) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ /* Try again without the PAT MSR bits */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
+ MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ } else {
+ if (bootverbose)
+ printf("vmm: PAT MSR access not supported\n");
+ guest_msr_valid(MSR_PAT);
+ vmx_no_patmsr = 1;
+ }
+ }
+
+ /* Check support for VM-entry controls */
+ if (!vmx_no_patmsr) {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ } else {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ }
+
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
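+ *
+ * A bit that is 1 in both the FIXED0 and FIXED1 MSRs must be 1 in
+ * the control register, a bit that is 0 in both must be 0, and
+ * bits that differ between the two are left under guest control.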
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is alright
+ * as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+ uint16_t vpid = 0;
+
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
+ do {
+ vpid = atomic_fetchadd_int(&nextvpid, 1);
+ } while (vpid == 0);
+ }
+
+ return (vpid);
+}
+
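+/*
+ * %cr0 and %cr4 use a guest/host mask and read shadow: guest reads of
+ * a masked bit return the shadow bit, and guest writes that differ
+ * from the shadow in a masked bit cause a VM exit. The mask is
+ * programmed with every fixed bit and the shadow reports the
+ * must-be-one bits, so the guest reads the values it expects.
+ */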
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value, shadow_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ shadow_value = cr0_ones_mask;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ shadow_value = cr4_ones_mask;
+ }
+
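+ /*
+ * Guest reads of the masked cr bits return the value from the
+ * read shadow, and guest writes that would modify them cause a
+ * VM exit. Seed the shadow with the bits forced to 1.
+ */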
+ error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs))
+#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs))
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER))
+ panic("vmx_vminit: error setting guest msr access");
+
+ /*
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit. However, if running on a system with no
+ * MSR_PAT save/restore support, leave access disabled so accesses
+ * will be trapped.
+ */
+ if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest pat msr access");
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid(vm, vcpu,
+ (uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx),
+ (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ int error, lastcpu;
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+
+ vmxstate = &vmx->state[vcpu];
+ lastcpu = vmxstate->lastcpu;
+ vmxstate->lastcpu = curcpu;
+
+ if (lastcpu == curcpu) {
+ error = 0;
+ goto done;
+ }
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
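+ /*
+ * The host TR, GDTR and GS bases differ on each physical cpu,
+ * so refresh them in the VMCS host-state area after a migration.
+ */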
+ error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ if (error != 0)
+ goto done;
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+done:
+ return (error);
+}
+
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
+static void __inline
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static void __inline
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (!vm_nmi_pending(vmx->vm, vcpu))
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+ * the virtual NMI as soon as the blocking condition goes away.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+ /*
+ * If there is already an interrupt pending then just return.
+ *
+ * This could happen if an interrupt was injected on a prior
+ * VM entry but the actual entry into guest mode was aborted
+ * because of a pending AST.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as the blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error, cr, vmcs_guest_cr;
+ uint64_t regval, ones_mask, zeros_mask;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 or %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ cr = exitqual & 0xf;
+ if (cr != 0 && cr != 4)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+ * call vmclear(vmcs) as a side-effect which we certainly don't want.
+ */
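+ /* Bits 11:8 of the exit qualification identify the source register */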
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ if (cr == 0) {
+ ones_mask = cr0_ones_mask;
+ zeros_mask = cr0_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR0;
+ } else {
+ ones_mask = cr4_ones_mask;
+ zeros_mask = cr4_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR4;
+ }
+ regval |= ones_mask;
+ regval &= ~zeros_mask;
+ error = vmwrite(vmcs_guest_cr, regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: error %d writing cr%d",
+ error, cr);
+ }
+
+ return (HANDLED);
+}
+
+static int
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+{
+ int read, write, error;
+
+ /* EPT violation on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (UNHANDLED);
+
+ /* EPT violation must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (UNHANDLED);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (UNHANDLED);
+ }
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
+ return (UNHANDLED);
+
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
+ return (UNHANDLED);
+
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
+
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int error, handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual, gla, gpa, cr3, intr_info;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_HLT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ /*
+ * If there is an event waiting to be injected then there is
+ * no need to 'hlt'.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
+ if (error)
+ panic("vmx_exit_process: vmread(intrinfo) %d", error);
+
+ if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
+ handled = 1;
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
+ } else
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmexit->exitcode = VM_EXITCODE_INOUT;
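+ /*
+ * Decode the exit qualification for the I/O instruction:
+ * bits 2:0 hold (size - 1), bit 3 the direction, bit 4 a
+ * string operation, bit 5 a REP prefix and bits 31:16 the
+ * port number.
+ */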
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ break;
+ case EXIT_REASON_CPUID:
+ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ gla = vmcs_gla();
+ gpa = vmcs_gpa();
+ cr3 = vmcs_guest_cr3();
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->u.paging.gpa = gpa;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel.
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+
+ /*
+ * Special case for spinning up an AP - exit to userspace to
+ * give the controlling process a chance to intercept and
+ * spin up a thread for the AP.
+ */
+ if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+ handled = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /*
+ * A VM exit loads RFLAGS with all bits except bit 1 cleared,
+ * so interrupts are disabled at this point; enable them
+ * before processing the exit.
+ */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS.
+ * If a VM exit is not handled then the exitcode must not be BOGUS.
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int error;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXXSMP we also need to vmclear the VMCSs that are active on the
+ * other cpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+
+ return;
+}
+
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ struct vmx *vmx = arg;
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXX Allow caller to set contents of the guest registers saved in
+ * the 'vmxctx' even though the vcpu might be running. We need this
+ * specifically to support the rdmsr emulation that will set the
+ * %eax and %edx registers during vm exit processing.
+ */
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+ int code_valid)
+{
+ int error;
+ uint64_t info;
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
+ static uint32_t type_map[VM_EVENT_MAX] = {
+ 0x1, /* VM_EVENT_NONE */
+ 0x0, /* VM_HW_INTR */
+ 0x2, /* VM_NMI */
+ 0x3, /* VM_HW_EXCEPTION */
+ 0x4, /* VM_SW_INTR */
+ 0x5, /* VM_PRIV_SW_EXCEPTION */
+ 0x6, /* VM_SW_EXCEPTION */
+ };
+
+ /*
+ * If there is already an exception pending to be delivered to the
+ * vcpu then just return.
+ */
+ error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
+ if (error)
+ return (error);
+
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return (EAGAIN);
+
+ info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+ info |= VMCS_INTERRUPTION_INFO_VALID;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+ if (error != 0)
+ return (error);
+
+ if (code_valid) {
+ error = vmcs_setreg(vmcs,
+ VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+ code);
+ }
+ return (error);
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
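+ /*
+ * Make the vmcs current for the vmwrite() and then vmclear
+ * it so a later vmlaunch/vmresume does not fail.
+ */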
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 0000000..c7cd567
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+};
+
+struct vmxstate {
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
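+/*
+ * The processor uses the physical addresses of the EPT pml4, the VMCSs
+ * and the MSR bitmap, all of which must be page-aligned.
+ */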
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000..31f29f8
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000..2e66443
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMFailInvalid 1
+ * VMFailValid 2 see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
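+/*
+ * VMPTRLD() and VMCLEAR() bracket a critical section so that the thread
+ * cannot migrate to another cpu while this vmcs is current.
+ */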
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 0000000..823a05d
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
+ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000..2aba63c
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
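+ /*
+ * The 4KB bitmap consists of 1KB read bitmaps for the low
+ * (0 - 0x1FFF) and high (0xC0000000 - 0xC0001FFF) MSR ranges,
+ * followed by the corresponding write bitmaps at a 2KB offset.
+ * A clear bit allows the guest direct access to the MSR.
+ */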
+ if (msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000..e6379a9
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48F
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs. Unless of course we do it the hard way - which
+ * would involve some form of synchronization between the vcpus to vmclear
+ * all VMCSs' that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 0000000..4ba582a
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack, so we don't want an interrupt handler to trash it.
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending, then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore %rdi last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is '0' when it returns directly from here.
+ * Return value is '1' when it returns after a vm exit through vmx_longjmp().
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of 2 (VMX_RETURN_VMRESUME).
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is 'void', this function may return indirectly
+ * through vmx_setjmp() with a return value of 3 (VMX_RETURN_VMLAUNCH).
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
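
The assembly above implements a setjmp/longjmp-style protocol: vmx_setjmp() snapshots the host's callee-saved registers into the vmxctx and returns VMX_RETURN_DIRECT, while the host RIP programmed into the VMCS points at vmx_longjmp(), so every VM-exit "returns" from that same vmx_setjmp() call a second time. A hedged sketch of the C-side caller (the committed logic lives in vmx.c and is more involved; handle_exit() is an invented placeholder):

    rc = vmx_setjmp(&vmxctx);
    switch (rc) {
    case VMX_RETURN_DIRECT:         /* first entry into the guest */
            vmx_launch(&vmxctx);    /* does not return on success */
            break;
    case VMX_RETURN_LONGJMP:        /* back here after a VM-exit */
            handle_exit(&vmxctx);   /* hypothetical exit handler */
            break;
    case VMX_RETURN_AST:            /* VMX_CHECK_AST bailed out early */
    case VMX_RETURN_VMRESUME:       /* vmresume itself failed */
    case VMX_RETURN_VMLAUNCH:       /* vmlaunch itself failed */
            /* error and retry paths */
            break;
    }

Subsequent entries into the same vcpu would use vmx_resume() rather than vmx_launch(), since the VMCS has already been launched.
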
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 0000000..ef0e9bc
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,677 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/* Section 10.4 "Register Descriptions" */
+struct vtdmap {
+ volatile uint32_t version;
+ volatile uint32_t res0;
+ volatile uint64_t cap;
+ volatile uint64_t ext_cap;
+ volatile uint32_t gcr;
+ volatile uint32_t gsr;
+ volatile uint64_t rta;
+ volatile uint64_t ccr;
+};
+
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+#define VTD_GCR_WBF (1 << 27)
+#define VTD_GCR_SRTP (1 << 30)
+#define VTD_GCR_TE (1 << 31)
+
+#define VTD_GSR_WBFS (1 << 27)
+#define VTD_GSR_RTPS (1 << 30)
+#define VTD_GSR_TES (1 << 31)
+
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next;
+};
+
+static SLIST_HEAD(, domain) domhead;
+
+#define DRHD_MAX_UNITS 8
+static int drhd_num;
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
+static int max_domains;
+typedef int (*drhd_ident_func_t)(void);
+
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+ int units, nlbus;
+ uint16_t did, vid;
+ uint32_t miscsts, vtbar;
+
+ const int bus = 0;
+ const int slot = 20;
+ const int func = 0;
+
+ units = 0;
+
+ vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+ did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+ if (vid != 0x8086 || did != 0x342E)
+ goto done;
+
+ /*
+ * Check if this is a dual IOH configuration.
+ */
+ miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+ if (miscsts & (1 << 25))
+ nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+ else
+ nlbus = -1;
+
+ vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in legacy IOH is disabled!\n");
+
+ if (nlbus != -1) {
+ vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in non-legacy IOH is disabled!\n");
+ }
+done:
+ return (units);
+}
+
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+ int nd;
+
+ nd = VTD_CAP_ND(vtdmap->cap);
+
+ switch (nd) {
+ case 0:
+ return (16);
+ case 1:
+ return (64);
+ case 2:
+ return (256);
+ case 3:
+ return (1024);
+ case 4:
+ return (4 * 1024);
+ case 5:
+ return (16 * 1024);
+ case 6:
+ return (64 * 1024);
+ default:
+ panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+ }
+}
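
The switch above simply unrolls the encoding of the ND field in the VT-d Capability Register: the hardware supports 2^(4 + 2*ND) domain IDs, so ND = 0 means 2^4 = 16 domains and ND = 6 means 2^16 = 65536.
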
+
+static u_int
+domain_id(void)
+{
+ u_int id;
+ struct domain *dom;
+
+ /* Skip domain id 0 - it is reserved when the Caching Mode field is set */
+ for (id = 1; id < max_domains; id++) {
+ SLIST_FOREACH(dom, &domhead, next) {
+ if (dom->id == id)
+ break;
+ }
+ if (dom == NULL)
+ break; /* found it */
+ }
+
+ if (id >= max_domains)
+ panic("domain ids exhausted");
+
+ return (id);
+}
+
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+
+ if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+ pmap_invalidate_cache();
+
+ if (VTD_CAP_RWBF(vtdmap->cap)) {
+ vtdmap->gcr = VTD_GCR_WBF;
+ while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+ ;
+ }
+}
+
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+ int offset;
+ volatile uint64_t *iotlb_reg, val;
+
+ vtd_wbflush(vtdmap);
+
+ offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+ iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+ *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+ VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+ while (1) {
+ val = *iotlb_reg;
+ if ((val & VTD_IIR_IVT) == 0)
+ break;
+ }
+}
+
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+ * of the context pointer are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+ * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
+ *
+ * XXX use device-selective invalidation for Context Cache
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+#define CREATE_MAPPING 0
+#define REMOVE_MAPPING 1
+
+static uint64_t
+vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
+ int remove)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+ if (gpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
+
+ if (hpa & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
+
+ if (len & PAGE_MASK)
+ panic("vtd_create_mapping: unaligned len 0x%0lx", len);
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Update the 'gpa' -> 'hpa' mapping
+ */
+ if (remove) {
+ ptp[ptpindex] = 0;
+ } else {
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+ }
+
+ return (1UL << ptpshift);
+}
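
A worked example of the mapping-size selection above: if dom->spsmask has bit 0 set (2MB superpages supported), gpa and hpa are both 2MB-aligned, and len >= 2MB, the loop exits at i = 0 with spshift = 21. The page-table walk then stops as soon as ptpshift drops to 21, one level above the 4KB leaves; the entry is written with VTD_PTE_SUPERPAGE (nlevels > 0 at that point), and the function reports 1UL << 21 = 2MB consumed, which the chunking loops in iommu_create_mapping() and iommu_remove_mapping() use to advance gpa and hpa.
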
+
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
+}
+
+static uint64_t
+vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
+}
+
+static void
+vtd_invalidate_tlb(void *dom)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ /*
+ * Invalidate the IOTLB.
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
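
To make the arithmetic concrete: for a guest with maxaddr = 4GB, the loop above exits with gaw = 33 (the post-increment runs once more after addr reaches maxaddr), res = (33 - 12) % 9 = 3, and agaw = 33 + 9 - 3 = 39. Assuming the hardware advertises 3-level (39-bit) support in its SAGAW field, the scan then stops with sagaw = 39, pt_levels = 3, and addrwidth = 1, so a 4GB guest gets a 3-level page table.
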
+
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+ int i;
+ uint64_t *nlp;
+
+ if (level > 1) {
+ for (i = 0; i < 512; i++) {
+ if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
+ continue;
+ if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
+ continue;
+ nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
+ vtd_free_ptp(nlp, level - 1);
+ }
+ }
+
+ bzero(ptp, PAGE_SIZE);
+ free(ptp, M_VTD);
+}
+
+static void
+vtd_destroy_domain(void *arg)
+{
+ struct domain *dom;
+
+ dom = arg;
+
+ SLIST_REMOVE(&domhead, dom, domain, next);
+ vtd_free_ptp(dom->ptp, dom->pt_levels);
+ free(dom, M_VTD);
+}
+
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_remove_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+ vtd_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
new file mode 100644
index 0000000..c8447cc
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+#include "iommu.h"
+
+static boolean_t iommu_avail;
+static struct iommu_ops *ops;
+static void *host_domain;
+
+static __inline int
+IOMMU_INIT(void)
+{
+ if (ops != NULL)
+ return ((*ops->init)());
+ else
+ return (ENXIO);
+}
+
+static __inline void
+IOMMU_CLEANUP(void)
+{
+ if (ops != NULL && iommu_avail)
+ (*ops->cleanup)();
+}
+
+static __inline void *
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_domain)(maxaddr));
+ else
+ return (NULL);
+}
+
+static __inline void
+IOMMU_DESTROY_DOMAIN(void *dom)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->destroy_domain)(dom);
+}
+
+static __inline uint64_t
+IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_mapping)(domain, gpa, hpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline uint64_t
+IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->remove_mapping)(domain, gpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline void
+IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->add_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->remove_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_INVALIDATE_TLB(void *domain)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->invalidate_tlb)(domain);
+}
+
+static __inline void
+IOMMU_ENABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->enable)();
+}
+
+static __inline void
+IOMMU_DISABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->disable)();
+}
+
+void
+iommu_init(void)
+{
+ int error, bus, slot, func;
+ vm_paddr_t maxaddr;
+ const char *name;
+ device_t dev;
+
+ if (vmm_is_intel())
+ ops = &iommu_ops_intel;
+ else if (vmm_is_amd())
+ ops = &iommu_ops_amd;
+ else
+ ops = NULL;
+
+ error = IOMMU_INIT();
+ if (error)
+ return;
+
+ iommu_avail = TRUE;
+
+ /*
+ * Create a domain for the devices owned by the host
+ */
+ maxaddr = vmm_mem_maxaddr();
+ host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+ if (host_domain == NULL)
+ panic("iommu_init: unable to create a host domain");
+
+ /*
+ * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
+ * the host
+ */
+ iommu_create_mapping(host_domain, 0, 0, maxaddr);
+
+ for (bus = 0; bus <= PCI_BUSMAX; bus++) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ for (func = 0; func <= PCI_FUNCMAX; func++) {
+ dev = pci_find_dbsf(0, bus, slot, func);
+ if (dev == NULL)
+ continue;
+
+ /* skip passthrough devices */
+ name = device_get_name(dev);
+ if (name != NULL && strcmp(name, "ppt") == 0)
+ continue;
+
+ /* everything else belongs to the host domain */
+ iommu_add_device(host_domain, bus, slot, func);
+ }
+ }
+ }
+ IOMMU_ENABLE();
+
+}
+
+void
+iommu_cleanup(void)
+{
+ IOMMU_DISABLE();
+ IOMMU_DESTROY_DOMAIN(host_domain);
+ IOMMU_CLEANUP();
+}
+
+void *
+iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ return (IOMMU_CREATE_DOMAIN(maxaddr));
+}
+
+void
+iommu_destroy_domain(void *dom)
+{
+
+ IOMMU_DESTROY_DOMAIN(dom);
+}
+
+void
+iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
+{
+ uint64_t mapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
+ gpa += mapped;
+ hpa += mapped;
+ remaining -= mapped;
+ }
+}
+
+void
+iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
+{
+ uint64_t unmapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
+ gpa += unmapped;
+ remaining -= unmapped;
+ }
+}
+
+void *
+iommu_host_domain(void)
+{
+
+ return (host_domain);
+}
+
+void
+iommu_add_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_ADD_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_remove_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_invalidate_tlb(void *domain)
+{
+
+ IOMMU_INVALIDATE_TLB(domain);
+}
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
new file mode 100644
index 0000000..d5c1d6e
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_IOMMU_H_
+#define _IO_IOMMU_H_
+
+typedef int (*iommu_init_func_t)(void);
+typedef void (*iommu_cleanup_func_t)(void);
+typedef void (*iommu_enable_func_t)(void);
+typedef void (*iommu_disable_func_t)(void);
+typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
+typedef void (*iommu_destroy_domain_t)(void *domain);
+typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
+ vm_paddr_t hpa, uint64_t len);
+typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
+ uint64_t len);
+typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
+typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
+typedef void (*iommu_invalidate_tlb_t)(void *dom);
+
+struct iommu_ops {
+ iommu_init_func_t init; /* module wide */
+ iommu_cleanup_func_t cleanup;
+ iommu_enable_func_t enable;
+ iommu_disable_func_t disable;
+
+ iommu_create_domain_t create_domain; /* domain-specific */
+ iommu_destroy_domain_t destroy_domain;
+ iommu_create_mapping_t create_mapping;
+ iommu_remove_mapping_t remove_mapping;
+ iommu_add_device_t add_device;
+ iommu_remove_device_t remove_device;
+ iommu_invalidate_tlb_t invalidate_tlb;
+};
+
+extern struct iommu_ops iommu_ops_intel;
+extern struct iommu_ops iommu_ops_amd;
+
+void iommu_init(void);
+void iommu_cleanup(void);
+void *iommu_host_domain(void);
+void *iommu_create_domain(vm_paddr_t maxaddr);
+void iommu_destroy_domain(void *dom);
+void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t len);
+void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
+void iommu_add_device(void *dom, int bus, int slot, int func);
+void iommu_remove_device(void *dom, int bus, int slot, int func);
+void iommu_invalidate_tlb(void *domain);
+#endif
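
Putting the pieces together, a consumer of this interface (the vmm code) would go through roughly the following sequence to hand a PCI function to a guest. This is a hedged sketch; the 4GB bound, the 3/0/0 bus/slot/function triple, and hpa_base/len are illustrative values, not taken from the committed callers:

    void *dom;

    /* One set of translation tables per guest, sized to its address space. */
    dom = iommu_create_domain(4UL * 1024 * 1024 * 1024);

    /* Back guest-physical [0, len) with host-physical [hpa_base, hpa_base + len). */
    iommu_create_mapping(dom, 0, hpa_base, len);

    /* DMA from PCI device 3/0/0 now translates through this domain. */
    iommu_add_device(dom, 3, 0, 0);

    /* ... guest runs ... */

    iommu_remove_device(dom, 3, 0, 0);
    iommu_remove_mapping(dom, 0, len);
    iommu_destroy_domain(dom);

Note that iommu_create_mapping() loops internally, so a caller can pass an arbitrarily large region and let the backend pick the largest page size on each iteration.
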
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
new file mode 100644
index 0000000..fdf136b
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.c
@@ -0,0 +1,610 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/resource.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+
+#include "iommu.h"
+#include "ppt.h"
+
+/* XXX locking */
+
+#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
+#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
+#define MAX_MSIMSGS 32
+
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
+struct pptintr_arg { /* pptintr(pptintr_arg) */
+ struct pptdev *pptdev;
+ int vec;
+ int vcpu;
+};
+
+static struct pptdev {
+ device_t dev;
+ struct vm *vm; /* owner of this device */
+ struct vm_memory_segment mmio[MAX_MMIOSEGS];
+ struct {
+ int num_msgs; /* guest state */
+
+ int startrid; /* host state */
+ struct resource *res[MAX_MSIMSGS];
+ void *cookie[MAX_MSIMSGS];
+ struct pptintr_arg arg[MAX_MSIMSGS];
+ } msi;
+
+ struct {
+ int num_msgs;
+ int startrid;
+ int msix_table_rid;
+ struct resource *msix_table_res;
+ struct resource **res;
+ void **cookie;
+ struct pptintr_arg *arg;
+ } msix;
+} pptdevs[32];
+
+static int num_pptdevs;
+
+static int
+ppt_probe(device_t dev)
+{
+ int bus, slot, func;
+ struct pci_devinfo *dinfo;
+
+ dinfo = (struct pci_devinfo *)device_get_ivars(dev);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+
+ /*
+ * To qualify as a pci passthrough device, a device must:
+ * - be allowed by administrator to be used in this role
+ * - be an endpoint device
+ */
+ if (vmm_is_pptdev(bus, slot, func) &&
+ (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+ppt_attach(device_t dev)
+{
+ int n;
+
+ if (num_pptdevs >= MAX_PPTDEVS) {
+ printf("ppt_attach: maximum number of pci passthrough devices "
+ "exceeded\n");
+ return (ENXIO);
+ }
+
+ n = num_pptdevs++;
+ pptdevs[n].dev = dev;
+
+ if (bootverbose)
+ device_printf(dev, "attached\n");
+
+ return (0);
+}
+
+static int
+ppt_detach(device_t dev)
+{
+ /*
+ * XXX check whether there are any pci passthrough devices assigned
+ * to guests before we allow this driver to detach.
+ */
+
+ return (0);
+}
+
+static device_method_t ppt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ppt_probe),
+ DEVMETHOD(device_attach, ppt_attach),
+ DEVMETHOD(device_detach, ppt_detach),
+ {0, 0}
+};
+
+static devclass_t ppt_devclass;
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
+
+static struct pptdev *
+ppt_find(int bus, int slot, int func)
+{
+ device_t dev;
+ int i, b, s, f;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ dev = pptdevs[i].dev;
+ b = pci_get_bus(dev);
+ s = pci_get_slot(dev);
+ f = pci_get_function(dev);
+ if (bus == b && slot == s && func == f)
+ return (&pptdevs[i]);
+ }
+ return (NULL);
+}
+
+static void
+ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
+{
+ int i;
+ struct vm_memory_segment *seg;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
+ bzero(seg, sizeof(struct vm_memory_segment));
+ }
+}
+
+static void
+ppt_teardown_msi(struct pptdev *ppt)
+{
+ int i, rid;
+ void *cookie;
+ struct resource *res;
+
+ if (ppt->msi.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msi.num_msgs; i++) {
+ rid = ppt->msi.startrid + i;
+ res = ppt->msi.res[i];
+ cookie = ppt->msi.cookie[i];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msi.res[i] = NULL;
+ ppt->msi.cookie[i] = NULL;
+ }
+
+ if (ppt->msi.startrid == 1)
+ pci_release_msi(ppt->dev);
+
+ ppt->msi.num_msgs = 0;
+}
+
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+ int rid;
+ struct resource *res;
+ void *cookie;
+
+ rid = ppt->msix.startrid + idx;
+ res = ppt->msix.res[idx];
+ cookie = ppt->msix.cookie[idx];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msix.res[idx] = NULL;
+ ppt->msix.cookie[idx] = NULL;
+}
+
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+ int i;
+
+ if (ppt->msix.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msix.num_msgs; i++)
+ ppt_teardown_msix_intr(ppt, i);
+
+ if (ppt->msix.msix_table_res) {
+ bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+ ppt->msix.msix_table_rid,
+ ppt->msix.msix_table_res);
+ ppt->msix.msix_table_res = NULL;
+ ppt->msix.msix_table_rid = 0;
+ }
+
+ free(ppt->msix.res, M_PPTMSIX);
+ free(ppt->msix.cookie, M_PPTMSIX);
+ free(ppt->msix.arg, M_PPTMSIX);
+
+ pci_release_msi(ppt->dev);
+
+ ppt->msix.num_msgs = 0;
+}
+
+int
+ppt_assign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is owned by a different VM then we
+ * cannot change its owner.
+ */
+ if (ppt->vm != NULL && ppt->vm != vm)
+ return (EBUSY);
+
+ ppt->vm = vm;
+ iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is not owned by this 'vm' then bail out.
+ */
+ if (ppt->vm != vm)
+ return (EBUSY);
+ ppt_unmap_mmio(vm, ppt);
+ ppt_teardown_msi(ppt);
+ ppt_teardown_msix(ppt);
+ iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
+ ppt->vm = NULL;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_all(struct vm *vm)
+{
+ int i, bus, slot, func;
+ device_t dev;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm) {
+ dev = pptdevs[i].dev;
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ ppt_unassign_device(vm, bus, slot, func);
+ }
+ }
+
+ return (0);
+}
+
+int
+ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ int i, error;
+ struct vm_memory_segment *seg;
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ if (ppt->vm != vm)
+ return (EBUSY);
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0) {
+ error = vm_map_mmio(vm, gpa, len, hpa);
+ if (error == 0) {
+ seg->gpa = gpa;
+ seg->len = len;
+ }
+ return (error);
+ }
+ }
+ return (ENOSPC);
+ }
+ return (ENOENT);
+}
+
+static int
+pptintr(void *arg)
+{
+ int vec;
+ struct pptdev *ppt;
+ struct pptintr_arg *pptarg;
+
+ pptarg = arg;
+ ppt = pptarg->pptdev;
+ vec = pptarg->vec;
+
+ if (ppt->vm != NULL)
+ (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
+ else {
+ /*
+ * XXX
+ * This is not expected to happen - panic?
+ */
+ }
+
+ /*
+ * For legacy interrupts give other filters a chance in case
+ * the interrupt was not generated by the passthrough device.
+ */
+ if (ppt->msi.startrid == 0)
+ return (FILTER_STRAY);
+ else
+ return (FILTER_HANDLED);
+}
+
+/*
+ * XXX
+ * When we try to free the MSI resource the kernel will bind the thread to
+ * the host cpu that was originally handling the MSI. The function freeing the
+ * MSI vector (apic_free_vector()) will panic the kernel if the thread
+ * is already bound to a cpu.
+ *
+ * So, we temporarily unbind the vcpu thread before freeing the MSI resource.
+ */
+static void
+PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
+{
+ int pincpu = -1;
+
+ vm_get_pinning(vm, vcpu, &pincpu);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, -1);
+
+ ppt_teardown_msi(ppt);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, pincpu);
+}
+
+int
+ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ int i, rid, flags;
+ int msi_count, startrid, error, tmp;
+ struct pptdev *ppt;
+
+ if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
+ (vector < 0 || vector > 255) ||
+ (numvec < 0 || numvec > MAX_MSIMSGS))
+ return (EINVAL);
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ /* Free any allocated resources */
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+
+ if (numvec == 0) /* nothing more to do */
+ return (0);
+
+ flags = RF_ACTIVE;
+ msi_count = pci_msi_count(ppt->dev);
+ if (msi_count == 0) {
+ startrid = 0; /* legacy interrupt */
+ msi_count = 1;
+ flags |= RF_SHAREABLE;
+ } else
+ startrid = 1; /* MSI */
+
+ /*
+ * The device must be capable of supporting the number of vectors
+ * the guest wants to allocate.
+ */
+ if (numvec > msi_count)
+ return (EINVAL);
+
+ /*
+ * Make sure that we can allocate all the MSI vectors that are needed
+ * by the guest.
+ */
+ if (startrid == 1) {
+ tmp = numvec;
+ error = pci_alloc_msi(ppt->dev, &tmp);
+ if (error)
+ return (error);
+ else if (tmp != numvec) {
+ pci_release_msi(ppt->dev);
+ return (ENOSPC);
+ } else {
+ /* success */
+ }
+ }
+
+ ppt->msi.startrid = startrid;
+
+ /*
+ * Allocate the irq resource and attach it to the interrupt handler.
+ */
+ for (i = 0; i < numvec; i++) {
+ ppt->msi.num_msgs = i + 1;
+ ppt->msi.cookie[i] = NULL;
+
+ rid = startrid + i;
+ ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, flags);
+ if (ppt->msi.res[i] == NULL)
+ break;
+
+ ppt->msi.arg[i].pptdev = ppt;
+ ppt->msi.arg[i].vec = vector + i;
+ ppt->msi.arg[i].vcpu = destcpu;
+
+ error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msi.arg[i],
+ &ppt->msi.cookie[i]);
+ if (error != 0)
+ break;
+ }
+
+ if (i < numvec) {
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+ return (ENXIO);
+ }
+
+ return (0);
+}
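
The startrid values above follow FreeBSD's resource-ID convention for SYS_RES_IRQ: rid 0 is the device's legacy INTx line, while pci_alloc_msi() places MSI messages at rids 1 and up. That convention is also why pptintr() returns FILTER_STRAY when startrid is 0 - a shared INTx line may have been raised by some other device - but FILTER_HANDLED for MSI, which is never shared.
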
+
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct pptdev *ppt;
+ struct pci_devinfo *dinfo;
+ int numvec, alloced, rid, error;
+ size_t res_size, cookie_size, arg_size;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ dinfo = device_get_ivars(ppt->dev);
+ if (!dinfo)
+ return (ENXIO);
+
+ /*
+ * First-time configuration:
+ * Allocate the MSI-X table
+ * Allocate the IRQ resources
+ * Set up some variables in ppt->msix
+ */
+ if (ppt->msix.num_msgs == 0) {
+ numvec = pci_msix_count(ppt->dev);
+ if (numvec <= 0)
+ return (EINVAL);
+
+ ppt->msix.startrid = 1;
+ ppt->msix.num_msgs = numvec;
+
+ res_size = numvec * sizeof(ppt->msix.res[0]);
+ cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
+ arg_size = numvec * sizeof(ppt->msix.arg[0]);
+
+ ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+ ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
+ M_WAITOK | M_ZERO);
+ ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+
+ rid = dinfo->cfg.msix.msix_table_bar;
+ ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+
+ if (ppt->msix.msix_table_res == NULL) {
+ ppt_teardown_msix(ppt);
+ return (ENOSPC);
+ }
+ ppt->msix.msix_table_rid = rid;
+
+ alloced = numvec;
+ error = pci_alloc_msix(ppt->dev, &alloced);
+ if (error || alloced != numvec) {
+ ppt_teardown_msix(ppt);
+ return (error == 0 ? ENOSPC : error);
+ }
+ }
+
+ if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* Tear down the IRQ if it's already set up */
+ ppt_teardown_msix_intr(ppt, idx);
+
+ /* Allocate the IRQ resource */
+ ppt->msix.cookie[idx] = NULL;
+ rid = ppt->msix.startrid + idx;
+ ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, RF_ACTIVE);
+ if (ppt->msix.res[idx] == NULL)
+ return (ENXIO);
+
+ ppt->msix.arg[idx].pptdev = ppt;
+ ppt->msix.arg[idx].vec = msg;
+ ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+ /* Setup the MSI-X interrupt */
+ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msix.arg[idx],
+ &ppt->msix.cookie[idx]);
+
+ if (error != 0) {
+ bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
+ ppt->msix.cookie[idx] = NULL;
+ ppt->msix.res[idx] = NULL;
+ return (ENXIO);
+ }
+ } else {
+ /* Masked, tear it down if it's already been set up */
+ ppt_teardown_msix_intr(ppt, idx);
+ }
+
+ return (0);
+}
+
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
new file mode 100644
index 0000000..63c8228
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_PPT_H_
+#define _IO_PPT_H_
+
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_all(struct vm *vm);
+int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec);
+int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+#endif
diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c
new file mode 100644
index 0000000..cd6c5d1
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+ int rc = 0;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+ default:
+ break;
+ }
+ if (rc) {
+ printf("vdev %s init failed rc=%d\n",
+ vd->ops->name, rc);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+ // FIXME: we should verify we are in the context of the current
+ // vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
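
To make the registration flow concrete, here is a hedged sketch of a trivial device built on this interface; the name, guest-physical address, and behavior are invented for illustration:

    /* A read-only page of zeroes at an arbitrary guest-physical address. */
    static char nulldev_softc;              /* stand-in device state */

    static int
    nulldev_nop(void *dev)
    {
            return (0);                     /* nothing to init/reset/halt */
    }

    static int
    nulldev_memread(void *dev, uint64_t gpa, opsize_t size, uint64_t *data)
    {
            *data = 0;                      /* reads always see zero */
            return (0);
    }

    static struct vdev_ops nulldev_ops = {
            .name     = "nulldev",
            .init     = nulldev_nop,
            .reset    = nulldev_nop,
            .halt     = nulldev_nop,
            .memread  = nulldev_memread,
            .memwrite = NULL,               /* never called: region lacks MMIO_WRITE */
    };

    static struct io_region nulldev_region = {
            .base = 0xD0000000,             /* example GPA */
            .len  = PAGE_SIZE,
            .attr = MMIO_READ,
    };

    vdev_register(&nulldev_ops, &nulldev_softc);
    vdev_register_region(&nulldev_ops, &nulldev_softc, &nulldev_region);

A guest load that hits the region ends up in vdev_memread(); a store is rejected with -EPERM by the attr check in vdev_memrw() before the NULL memwrite hook could ever be reached.
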
diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h
new file mode 100644
index 0000000..6feeba8
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
+
+#endif /* _VDEV_H_ */
+
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
new file mode 100644
index 0000000..15fc6c2
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -0,0 +1,901 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/clock.h>
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vdev.h"
+#include "vlapic.h"
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic.irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic.isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+#define PRIO(x) ((x) >> 4)
+
+#define VLAPIC_VERSION (16)
+#define VLAPIC_MAXLVT_ENTRIES (5)
+
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
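+/*
+ * The LAPIC register file places each 32-bit IRR/ISR/TMR word on a 16-byte
+ * boundary, so when the registers are walked as an array of uint32_t the
+ * word for vector 'v' sits at index (v / 32) * 4 and the bit within it is
+ * (v % 32). For example, vector 0x30 (48) lives in word index 4
+ * (irr1/isr1), bit 16. This is why the indexing throughout this file
+ * multiplies by 4.
+ */
+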
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+
+ struct io_region *mmio;
+ struct vdev_ops *ops;
+ struct LAPIC apic;
+
+ int esr_update;
+
+ int divisor;
+ int ccr_ticks;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+};
+
+#define VLAPIC_BUS_FREQ tsc_freq
+
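+/*
+ * The virtual APIC timer is clocked at the host TSC frequency divided by
+ * the value programmed into the DCR. The divide value is encoded in DCR
+ * bits 0, 1 and 3 (hence the 0xB mask below); for example, a DCR of 0x0
+ * selects divide-by-2 and 0xA selects divide-by-128.
+ */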
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+ switch (dcr & 0xB) {
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+	case APIC_TDCR_1:
+		return (1);
+	default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+ int i;
+ for (i = 0; i < num_lvt; i++) {
+ *lvts |= APIC_LVT_M;
+ lvts += 4;
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+	lapic->dfr = 0xffffffff;
+	lapic->svr = APIC_SVR_VECTOR;
+	vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES + 1);
+}
+
+static int
+vlapic_op_reset(void *dev)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+
+ memset(lapic, 0, sizeof(*lapic));
+ lapic->apr = vlapic->vcpuid;
+ vlapic_init_ipi(vlapic);
+ vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+	return (0);
+}
+
+static int
+vlapic_op_init(void *dev)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+ return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void *dev)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+	vdev_unregister_region(vlapic, vlapic->mmio);
+	return (0);
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr;
+ int idx;
+
+ if (vector < 0 || vector >= 256)
+		panic("vlapic_set_intr_ready: invalid vector %d", vector);
+
+ idx = (vector / 32) * 4;
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+ uint32_t icr_timer;
+
+ icr_timer = vlapic->apic.icr_timer;
+
+ vlapic->ccr_ticks = ticks;
+ if (elapsed < icr_timer)
+ vlapic->apic.ccr_timer = icr_timer - elapsed;
+ else {
+ /*
+ * This can happen when the guest is trying to run its local
+		 * apic timer faster than the 'hz' setting on the host.
+ *
+ * We deal with this by running the guest local apic timer
+ * at the rate of the host's 'hz' setting.
+ */
+ vlapic->apic.ccr_timer = 0;
+ }
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
+
+ if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+		panic("vlapic_get_lvt: invalid LVT");
+ }
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+	return ((&lapic->lvt_timer) + i);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic.isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adapted from the section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
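+/*
+ * Illustrative example: with TPR = 0x45 and highest in-service vector 0x51,
+ * PRIO(0x45) = 4 < PRIO(0x51) = 5, so PPR becomes 0x51 & 0xf0 = 0x50.
+ * Raising TPR to 0x55 would instead yield PPR = TPR = 0x55.
+ */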
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic.tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic.isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic.ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *isrptr;
+ int i, idx, bitpos;
+
+ isrptr = &lapic->isr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << (bitpos - 1));
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+{
+ return (*lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ int vector;
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+		vector = vlapic_get_lvt_field(lvt, APIC_LVTT_VECTOR);
+ vlapic_set_intr_ready(vlapic, vector);
+ }
+}
+
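+/*
+ * ICR layout refresher: the vector is in bits 7:0, the delivery mode in
+ * bits 10:8, the destination shorthand in bits 19:18 and the destination
+ * field in bits 63:56 (xAPIC) or 63:32 (x2APIC), which is why 'dest' is
+ * extracted below with a shift of 32 + 24 or 32. For example, the xAPIC
+ * value 0x0300000000000061 requests fixed delivery of vector 0x61 to
+ * physical destination 3.
+ */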
+static int
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+{
+ int i;
+ cpuset_t dmask;
+ uint32_t dest, vec, mode;
+ struct vlapic *vlapic2;
+ struct vm_exit *vmexit;
+
+ if (x2apic(vlapic))
+ dest = icrval >> 32;
+ else
+ dest = icrval >> (32 + 24);
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ CPU_SETOF(dest, &dmask);
+ break;
+ case APIC_DEST_SELF:
+ CPU_SETOF(vlapic->vcpuid, &dmask);
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ CPU_CLR(vlapic->vcpuid, &dmask);
+ break;
+ }
+
+ while ((i = cpusetobj_ffs(&dmask)) != 0) {
+ i--;
+ CPU_CLR(i, &dmask);
+ if (mode == APIC_DELMODE_FIXED)
+ lapic_set_intr(vlapic->vm, i, vec);
+ else
+ vm_inject_nmi(vlapic->vm, i);
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ if (mode == APIC_DELMODE_INIT) {
+ if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+ return (0);
+
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /* move from INIT to waiting-for-SIPI state */
+ if (vlapic2->boot_state == BS_INIT) {
+ vlapic2->boot_state = BS_SIPI;
+ }
+
+ return (0);
+ }
+ }
+
+ if (mode == APIC_DELMODE_STARTUP) {
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (vlapic2->boot_state != BS_SIPI)
+ return (0);
+
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+ /*
+ * XXX this assumes that the startup IPI always succeeds
+ */
+ vlapic2->boot_state = BS_RUNNING;
+ vm_activate_cpu(vlapic2->vm, dest);
+
+ return (0);
+ }
+ }
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
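+/*
+ * Scan the IRR from the highest priority class downwards and report the
+ * first pending vector whose class beats the current PPR. For example,
+ * with PPR = 0x50 a pending vector 0x61 is deliverable (class 6 > 5)
+ * while vector 0x52 is not (class 5 is not > 5).
+ */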
+int
+vlapic_pending_intr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ irrptr = &lapic->irr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ return (vector);
+ } else
+ break;
+ }
+ }
+ VLAPIC_CTR0(vlapic, "no pending intr");
+ return (-1);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+int
+vlapic_op_mem_read(void *dev, uint64_t gpa, opsize_t size, uint64_t *data)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+ uint64_t offset = gpa & ~(PAGE_SIZE);
+ uint32_t *reg;
+ int i;
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ return 0;
+ }
+
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ if (x2apic(vlapic))
+ *data = vlapic->vcpuid;
+ else
+ *data = vlapic->vcpuid << 24;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = lapic->tpr;
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ *data = *(reg);
+ break;
+ case APIC_OFFSET_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+ return 0;
+}
+
+int
+vlapic_op_mem_write(void *dev, uint64_t gpa, opsize_t size, uint64_t data)
+{
+	struct vlapic *vlapic = (struct vlapic *)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+ uint64_t offset = gpa & ~(PAGE_SIZE);
+ uint32_t *reg;
+ int retval;
+
+ if (offset > sizeof(*lapic)) {
+ return 0;
+ }
+
+ retval = 0;
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ break;
+ case APIC_OFFSET_TPR:
+ lapic->tpr = data & 0xff;
+ vlapic_update_ppr(vlapic);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ break;
+ case APIC_OFFSET_DFR:
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ if (!x2apic(vlapic)) {
+ data &= 0xffffffff;
+ data |= (uint64_t)lapic->icr_hi << 32;
+ }
+ retval = lapic_process_icr(vlapic, data);
+ break;
+ case APIC_OFFSET_ICR_HI:
+ if (!x2apic(vlapic)) {
+ retval = 0;
+ lapic->icr_hi = data;
+ }
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ data |= APIC_LVT_M;
+ }
+ *reg = data;
+ // vlapic_dump_lvt(offset, reg);
+ break;
+ case APIC_OFFSET_ICR:
+ lapic->icr_timer = data;
+ vlapic_start_timer(vlapic, 0);
+ break;
+
+ case APIC_OFFSET_DCR:
+ lapic->dcr_timer = data;
+ vlapic->divisor = vlapic_timer_divisor(data);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_update_errors(vlapic);
+ break;
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
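+/*
+ * Emulate the CCR countdown at host-tick granularity. Each elapsed host
+ * tick consumes (VLAPIC_BUS_FREQ / divisor) / hz timer counts; purely for
+ * illustration, a 2 GHz TSC with divisor 4 and hz = 100 burns 5,000,000
+ * counts per tick. Returns the number of host ticks until the timer next
+ * fires, 0 if it fired during this call, or -1 if the timer is disabled
+ * or a one-shot timer has already expired.
+ */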
+int
+vlapic_timer_tick(struct vlapic *vlapic)
+{
+ int curticks, delta, periodic, fired;
+ uint32_t ccr;
+ uint32_t decrement, leftover;
+
+restart:
+ curticks = ticks;
+ delta = curticks - vlapic->ccr_ticks;
+
+ /* Local APIC timer is disabled */
+ if (vlapic->apic.icr_timer == 0)
+ return (-1);
+
+ /* One-shot mode and timer has already counted down to zero */
+ periodic = vlapic_periodic_timer(vlapic);
+ if (!periodic && vlapic->apic.ccr_timer == 0)
+ return (-1);
+ /*
+	 * If 'curticks' and 'ccr_ticks' are out of sync by more than
+	 * 2^31 ticks then deal with it by restarting the timer.
+ */
+ if (delta < 0) {
+ vlapic_start_timer(vlapic, 0);
+ goto restart;
+ }
+
+ fired = 0;
+ decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
+
+ vlapic->ccr_ticks = curticks;
+ ccr = vlapic->apic.ccr_timer;
+
+ while (delta-- > 0) {
+ if (ccr > decrement) {
+ ccr -= decrement;
+ continue;
+ }
+
+ /* Trigger the local apic timer interrupt */
+ vlapic_fire_timer(vlapic);
+ if (periodic) {
+ leftover = decrement - ccr;
+ vlapic_start_timer(vlapic, leftover);
+ ccr = vlapic->apic.ccr_timer;
+ } else {
+ /*
+ * One-shot timer has counted down to zero.
+ */
+ ccr = 0;
+ }
+ fired = 1;
+ break;
+ }
+
+ vlapic->apic.ccr_timer = ccr;
+
+ if (!fired)
+ return ((ccr / decrement) + 1);
+ else
+ return (0);
+}
+
+struct vdev_ops vlapic_dev_ops = {
+ .name = "vlapic",
+ .init = vlapic_op_init,
+ .reset = vlapic_op_reset,
+ .halt = vlapic_op_halt,
+ .memread = vlapic_op_mem_read,
+ .memwrite = vlapic_op_mem_write,
+};
+
+static struct io_region vlapic_mmio[VM_MAXCPU];
+
+struct vlapic *
+vlapic_init(struct vm *vm, int vcpuid)
+{
+ struct vlapic *vlapic;
+
+ vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vm;
+ vlapic->vcpuid = vcpuid;
+
+ vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
+
+ if (vcpuid == 0)
+ vlapic->msr_apicbase |= APICBASE_BSP;
+
+ vlapic->ops = &vlapic_dev_ops;
+
+ vlapic->mmio = vlapic_mmio + vcpuid;
+ vlapic->mmio->base = DEFAULT_APIC_BASE;
+ vlapic->mmio->len = PAGE_SIZE;
+ vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
+ vlapic->mmio->vcpu = vcpuid;
+
+ vdev_register(&vlapic_dev_ops, vlapic);
+
+ vlapic_op_init(vlapic);
+
+ return (vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+ vlapic_op_halt(vlapic);
+ vdev_unregister(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
+uint64_t
+vlapic_get_apicbase(struct vlapic *vlapic)
+{
+
+ return (vlapic->msr_apicbase);
+}
+
+void
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
+{
+ int err;
+ enum x2apic_state state;
+
+ err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
+ if (err)
+ panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
+
+ if (state == X2APIC_DISABLED)
+ val &= ~APICBASE_X2APIC;
+
+ vlapic->msr_apicbase = val;
+}
+
+void
+vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, vcpuid);
+
+ if (state == X2APIC_DISABLED)
+ vlapic->msr_apicbase &= ~APICBASE_X2APIC;
+}
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
new file mode 100644
index 0000000..00de019
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers: Offset Description Access
+ */
+#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
+#define APIC_OFFSET_VER 0x30 // Local APIC Version R
+#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
+#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
+#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
+#define APIC_OFFSET_EOI 0xB0 // EOI Register W
+#define APIC_OFFSET_RRR 0xC0 // Remote read R
+#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
+#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
+#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
+#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
+#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
+#define APIC_OFFSET_ISR3 0x130 // ISR 096-127 R
+#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
+#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
+#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
+#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
+#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
+#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
+#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
+#define APIC_OFFSET_TMR3 0x1B0 // TMR 096-127 R
+#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
+#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
+#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
+#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
+#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
+#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
+#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
+#define APIC_OFFSET_IRR3 0x230 // IRR 096-127 R
+#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
+#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
+#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
+#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
+#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
+#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
+#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
+#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
+#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
+#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
+#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
+#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
+#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
+#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
+#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
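+
+/*
+ * Illustrative example: with vectors 0x2f and 0x51 in service the stack
+ * holds { 0, 0x2f, 0x51 }; the sentinel 0 at the bottom is the ISRV value
+ * used when no ISR bit is set. Priorities must be strictly increasing up
+ * the stack (class 2, then class 5 here).
+ */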
+
+enum x2apic_state;
+
+struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+int vlapic_op_mem_write(void *dev, uint64_t gpa,
+	opsize_t size, uint64_t data);
+
+int vlapic_op_mem_read(void *dev, uint64_t gpa,
+	opsize_t size, uint64_t *data);
+
+int vlapic_pending_intr(struct vlapic *vlapic);
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
+int vlapic_timer_tick(struct vlapic *vlapic);
+
+uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
+void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
+void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+
+#endif /* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
new file mode 100644
index 0000000..a4dea79
--- /dev/null
+++ b/sys/amd64/vmm/vmm.c
@@ -0,0 +1,1022 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include <machine/vmm_dev.h>
+#include "vlapic.h"
+#include "vmm_msr.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+
+#include "io/ppt.h"
+#include "io/iommu.h"
+
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ enum vcpu_state state;
+ struct mtx mtx;
+ int pincpu; /* host cpuid this vcpu is bound to */
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ uint64_t guest_msrs[VMM_MSR_NUM];
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu *guestfpu; /* guest fpu state */
+ void *stats;
+ struct vm_exit exitinfo;
+ enum x2apic_state x2apic_state;
+ int nmi_pending;
+};
+#define VCPU_F_PINNED 0x0001
+
+#define VCPU_PINCPU(vm, vcpuid) \
+ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)
+
+#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)
+
+#define VCPU_PIN(vm, vcpuid, host_cpuid) \
+do { \
+ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \
+ vm->vcpu[vcpuid].pincpu = host_cpuid; \
+} while(0)
+
+#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
+#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
+#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+
+#define VM_MAX_MEMORY_SEGMENTS 2
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Set of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpuset_t active_cpus;
+};
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
+#define VMRUN(vmi, vcpu, rip) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? \
+ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
+ ENXIO)
+#define VMMMAP_GET(vmi, gpa) \
+ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
+ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
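+
+/*
+ * 'ops' points at the hardware-specific backend (vmm_ops_intel or
+ * vmm_ops_amd, selected in vmm_init()). The wrapper macros above degrade
+ * gracefully, returning ENXIO (or NULL) if no backend has registered.
+ */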
+
+#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
+#define fpu_stop_emulating() clts()
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static void
+vcpu_cleanup(struct vcpu *vcpu)
+{
+ vlapic_cleanup(vcpu->vlapic);
+ vmm_stat_free(vcpu->stats);
+ fpu_save_area_free(vcpu->guestfpu);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu_lock_init(vcpu);
+ vcpu->hostcpu = NOCPU;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guestfpu = fpu_save_area_alloc();
+ fpu_save_area_reset(vcpu->guestfpu);
+ vcpu->stats = vmm_stat_alloc();
+}
+
+struct vm_exit *
+vm_exitinfo(struct vm *vm, int cpuid)
+{
+ struct vcpu *vcpu;
+
+ if (cpuid < 0 || cpuid >= VM_MAXCPU)
+ panic("vm_exitinfo: invalid cpuid %d", cpuid);
+
+ vcpu = &vm->vcpu[cpuid];
+
+ return (&vcpu->exitinfo);
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+ vmm_host_state_init();
+ vmm_ipi_init();
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ vmm_msr_init();
+
+ return (VMM_INIT());
+}
+
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ iommu_init();
+ error = vmm_init();
+ break;
+ case MOD_UNLOAD:
+ error = vmmdev_cleanup();
+ if (error == 0) {
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ }
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * Execute the module load handler after the pci passthru driver has had
+ * a chance to claim devices. We need this information at the time we do
+ * iommu initialization.
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+struct vm *
+vm_create(const char *name)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (NULL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ guest_msrs_init(vm, i);
+ }
+
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+ vm_activate_cpu(vm, BSP);
+
+ return (vm);
+}
+
+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+ size_t len;
+ vm_paddr_t hpa;
+ void *host_domain;
+
+ host_domain = iommu_host_domain();
+
+ len = 0;
+ while (len < seg->len) {
+ hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+ if (hpa == (vm_paddr_t)-1) {
+ panic("vm_free_mem_segs: cannot free hpa "
+ "associated with gpa 0x%016lx", seg->gpa + len);
+ }
+
+		 * Remove the 'gpa' to 'hpa' mapping in the VM's domain and
+		 * resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ */
+ iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
+ iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
+
+ vmm_mem_free(hpa, PAGE_SIZE);
+
+ len += PAGE_SIZE;
+ }
+
+ /*
+ * Invalidate cached translations associated with 'vm->iommu' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(vm->iommu);
+
+ bzero(seg, sizeof(struct vm_memory_segment));
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+ ppt_unassign_all(vm);
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
+
+ vm->num_mem_segs = 0;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(&vm->vcpu[i]);
+
+ iommu_destroy_domain(vm->iommu);
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
+ VM_PROT_NONE, spok));
+}
+
+/*
+ * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
+ */
+static boolean_t
+vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit;
+
+ if (gpa & PAGE_MASK)
+ panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa < gpalimit)
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int error, available, allocated;
+ struct vm_memory_segment *seg;
+ vm_paddr_t g, hpa;
+ void *host_domain;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+ return (EINVAL);
+
+ available = allocated = 0;
+ g = gpa;
+ while (g < gpa + len) {
+ if (vm_gpa_available(vm, g))
+ available++;
+ else
+ allocated++;
+
+ g += PAGE_SIZE;
+ }
+
+ /*
+ * If there are some allocated and some available pages in the address
+ * range then it is an error.
+ */
+ if (allocated && available)
+ return (EINVAL);
+
+ /*
+ * If the entire address range being requested has already been
+ * allocated then there isn't anything more to do.
+ */
+ if (allocated && available == 0)
+ return (0);
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+ host_domain = iommu_host_domain();
+
+ seg = &vm->mem_segs[vm->num_mem_segs];
+
+ error = 0;
+ seg->gpa = gpa;
+ seg->len = 0;
+ while (seg->len < len) {
+ hpa = vmm_mem_alloc(PAGE_SIZE);
+ if (hpa == 0) {
+ error = ENOMEM;
+ break;
+ }
+
+ error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
+ VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
+ if (error)
+ break;
+
+ /*
+ * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
+		 * Add a mapping from 'gpa + seg->len' to 'hpa' in the VM's domain.
+ */
+ iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
+ iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+
+ seg->len += PAGE_SIZE;
+ }
+
+ if (error) {
+ vm_free_mem_seg(vm, seg);
+ return (error);
+ }
+
+ /*
+ * Invalidate cached translations associated with 'host_domain' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(host_domain);
+
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
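+/*
+ * Translate a guest physical address to a host physical address. The range
+ * [gpa, gpa + len) must not cross a page boundary; for example, gpa 0x1f80
+ * with len 0x100 would span two pages and trips the panic below.
+ */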
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ vm_paddr_t nextpage;
+
+ nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
+ if (len > nextpage - gpa)
+ panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+
+ return (VMMMAP_GET(vm->cookie, gpa));
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMSETREG(vm->cookie, vcpu, reg, val));
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
+{
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *cpuid = VCPU_PINCPU(vm, vcpuid);
+
+ return (0);
+}
+
+int
+vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
+{
+ struct thread *td;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ td = curthread; /* XXXSMP only safe when muxing vcpus */
+
+ /* unpin */
+ if (host_cpuid < 0) {
+ VCPU_UNPIN(vm, vcpuid);
+ thread_lock(td);
+ sched_unbind(td);
+ thread_unlock(td);
+ return (0);
+ }
+
+ if (CPU_ABSENT(host_cpuid))
+ return (EINVAL);
+
+ /*
+ * XXX we should check that 'host_cpuid' has not already been pinned
+ * by another vm.
+ */
+ thread_lock(td);
+ sched_bind(td, host_cpuid);
+ thread_unlock(td);
+ VCPU_PIN(vm, vcpuid, host_cpuid);
+
+ return (0);
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+
+ /* flush host state to the pcb */
+ fpuexit(curthread);
+
+ /* restore guest FPU state */
+ fpu_stop_emulating();
+ fpurestore(vcpu->guestfpu);
+
+ /*
+ * The FPU is now "dirty" with the guest's state so turn on emulation
+ * to trap any access to the FPU by the host.
+ */
+ fpu_start_emulating();
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+
+ if ((rcr0() & CR0_TS) == 0)
+ panic("fpu emulation not enabled in host!");
+
+ /* save guest FPU state */
+ fpu_stop_emulating();
+ fpusave(vcpu->guestfpu);
+ fpu_start_emulating();
+}
+
+static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid, sleepticks, t;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval, rip;
+ struct vm_exit *vme;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vmrun->vm_exit;
+ rip = vmrun->rip;
+restart:
+ critical_enter();
+
+ tscval = rdtsc();
+
+ pcb = PCPU_GET(curpcb);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+
+ restore_guest_msrs(vm, vcpuid);
+ restore_guest_fpustate(vcpu);
+
+ vcpu->hostcpu = curcpu;
+ error = VMRUN(vm->cookie, vcpuid, rip);
+ vcpu->hostcpu = NOCPU;
+
+ save_guest_fpustate(vcpu);
+ restore_host_msrs(vm, vcpuid);
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ /* copy the exit information */
+ bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
+
+ critical_exit();
+
+ /*
+ * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
+ * is ready to run.
+ */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) &&
+ lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+
+ vcpu_unlock(vcpu);
+
+ rip = vme->rip + vme->inst_length;
+ goto restart;
+ }
+
+ return (error);
+}
+
+int
+vm_inject_event(struct vm *vm, int vcpuid, int type,
+ int vector, uint32_t code, int code_valid)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
+ return (EINVAL);
+
+ if (vector < 0 || vector > 255)
+ return (EINVAL);
+
+ return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+}
+
+static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->nmi_pending = 1;
+ vm_interrupt_hostcpu(vm, vcpuid);
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+uint64_t *
+vm_guest_msrs(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].guest_msrs);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+int
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+{
+ int error;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> RUNNING -> IDLE
+ * IDLE -> CANNOT_RUN -> IDLE
+ */
+ if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
+ (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
+ error = 0;
+ vcpu->state = state;
+ } else {
+ error = EBUSY;
+ }
+
+ vcpu_unlock(vcpu);
+
+ return (error);
+}
+
+enum vcpu_state
+vcpu_get_state(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ state = vcpu->state;
+ vcpu_unlock(vcpu);
+
+ return (state);
+}
+
+void
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, &vm->active_cpus);
+}
+
+cpuset_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
+
+int
+vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *state = vm->vcpu[vcpuid].x2apic_state;
+
+ return (0);
+}
+
+int
+vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (state < 0 || state >= X2APIC_STATE_LAST)
+ return (EINVAL);
+
+ vm->vcpu[vcpuid].x2apic_state = state;
+
+ vlapic_set_x2apic_state(vm, vcpuid, state);
+
+ return (0);
+}
+
+void
+vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
+{
+ int hostcpu;
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ hostcpu = vcpu->hostcpu;
+ if (hostcpu == NOCPU) {
+ /*
+ * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
+ * the host thread must be sleeping waiting for an event to
+ * kick the vcpu out of 'hlt'.
+ *
+ * XXX this is racy because the condition exists right before
+ * and after calling VMRUN() in vm_run(). The wakeup() is
+ * benign in this case.
+ */
+ if (vcpu->state == VCPU_RUNNING)
+ wakeup_one(vcpu);
+ } else {
+ if (vcpu->state != VCPU_RUNNING)
+ panic("invalid vcpu state %d", vcpu->state);
+ if (hostcpu != curcpu)
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
+ vcpu_unlock(vcpu);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
new file mode 100644
index 0000000..0150ebd
--- /dev/null
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -0,0 +1,538 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "vmm_mem.h"
+#include "io/ppt.h"
+#include <machine/vmm_dev.h>
+
+struct vmmdev_softc {
+ struct vm *vm; /* vm instance cookie */
+ struct cdev *cdev;
+ SLIST_ENTRY(vmmdev_softc) link;
+};
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(name, vm_name(sc->vm)) == 0)
+ break;
+ }
+
+ return (sc);
+}
+
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+
+ return (cdev->si_drv1);
+}
+
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+ int error, off, c;
+ vm_paddr_t hpa, gpa;
+ struct vmmdev_softc *sc;
+
+ static char zerobuf[PAGE_SIZE];
+
+ error = 0;
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ error = ENXIO;
+
+ while (uio->uio_resid > 0 && error == 0) {
+ gpa = uio->uio_offset;
+ off = gpa & PAGE_MASK;
+ c = min(uio->uio_resid, PAGE_SIZE - off);
+
+ /*
+ * The VM has a hole in its physical memory map. If we want to
+ * use 'dd' to inspect memory beyond the hole we need to
+ * provide bogus data for memory that lies in the hole.
+ *
+ * Since this device does not support lseek(2), dd(1) will
+ * read(2) blocks of data to simulate the lseek(2).
+ */
+ hpa = vm_gpa2hpa(sc->vm, gpa, c);
+ if (hpa == (vm_paddr_t)-1) {
+ if (uio->uio_rw == UIO_READ)
+ error = uiomove(zerobuf, c, uio);
+ else
+ error = EFAULT;
+ } else
+ error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+ return (error);
+}
+
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ int error, vcpu, state_changed;
+ enum vcpu_state new_state;
+ struct vmmdev_softc *sc;
+ struct vm_memory_segment *seg;
+ struct vm_register *vmreg;
+	struct vm_seg_desc *vmsegdesc;
+ struct vm_pin *vmpin;
+ struct vm_run *vmrun;
+ struct vm_event *vmevent;
+ struct vm_lapic_irq *vmirq;
+ struct vm_capability *vmcap;
+ struct vm_pptdev *pptdev;
+ struct vm_pptdev_mmio *pptmmio;
+ struct vm_pptdev_msi *pptmsi;
+ struct vm_pptdev_msix *pptmsix;
+ struct vm_nmi *vmnmi;
+ struct vm_stats *vmstats;
+ struct vm_stat_desc *statdesc;
+ struct vm_x2apic *x2apic;
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ return (ENXIO);
+
+ vcpu = -1;
+ state_changed = 0;
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_SET_PINNING:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EVENT:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSIX:
+ case VM_SET_X2APIC_STATE:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ vcpu = *(int *)data;
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (cmd == VM_RUN)
+ new_state = VCPU_RUNNING;
+ else
+ new_state = VCPU_CANNOT_RUN;
+
+ error = vcpu_set_state(sc->vm, vcpu, new_state);
+ if (error)
+ goto done;
+
+ state_changed = 1;
+ break;
+
+ case VM_MAP_PPTDEV_MMIO:
+ case VM_BIND_PPTDEV:
+ case VM_UNBIND_PPTDEV:
+ case VM_MAP_MEMORY:
+ /*
+ * ioctls that operate on the entire virtual machine must
+ * prevent all vcpus from running.
+ */
+ error = 0;
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+ if (error)
+ break;
+ }
+
+ if (error) {
+ while (--vcpu >= 0)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ goto done;
+ }
+
+ state_changed = 2;
+ break;
+
+ default:
+ break;
+ }
+
+	switch (cmd) {
+ case VM_RUN:
+ vmrun = (struct vm_run *)data;
+ error = vm_run(sc->vm, vmrun);
+ break;
+ case VM_STAT_DESC: {
+ const char *desc;
+ statdesc = (struct vm_stat_desc *)data;
+ desc = vmm_stat_desc(statdesc->index);
+ if (desc != NULL) {
+ error = 0;
+ strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
+ } else
+ error = EINVAL;
+ break;
+ }
+ case VM_STATS: {
+ CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
+ vmstats = (struct vm_stats *)data;
+ getmicrotime(&vmstats->tv);
+ error = vmm_stat_copy(sc->vm, vmstats->cpuid,
+ &vmstats->num_entries, vmstats->statbuf);
+ break;
+ }
+ case VM_PPTDEV_MSI:
+ pptmsi = (struct vm_pptdev_msi *)data;
+ error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
+ pptmsi->bus, pptmsi->slot, pptmsi->func,
+ pptmsi->destcpu, pptmsi->vector,
+ pptmsi->numvec);
+ break;
+ case VM_PPTDEV_MSIX:
+ pptmsix = (struct vm_pptdev_msix *)data;
+ error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+ pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx,
+ pptmsix->msg, pptmsix->vector_control,
+ pptmsix->addr);
+ break;
+ case VM_MAP_PPTDEV_MMIO:
+ pptmmio = (struct vm_pptdev_mmio *)data;
+ error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
+ pptmmio->func, pptmmio->gpa, pptmmio->len,
+ pptmmio->hpa);
+ break;
+ case VM_BIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_UNBIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_INJECT_EVENT:
+ vmevent = (struct vm_event *)data;
+ error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
+ vmevent->vector,
+ vmevent->error_code,
+ vmevent->error_code_valid);
+ break;
+ case VM_INJECT_NMI:
+ vmnmi = (struct vm_nmi *)data;
+ error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
+ break;
+ case VM_LAPIC_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
+ break;
+ case VM_SET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
+ vmpin->host_cpuid);
+ break;
+ case VM_GET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
+ &vmpin->host_cpuid);
+ break;
+ case VM_MAP_MEMORY:
+ seg = (struct vm_memory_segment *)data;
+ error = vm_malloc(sc->vm, seg->gpa, seg->len);
+ break;
+ case VM_GET_MEMORY_SEG:
+ seg = (struct vm_memory_segment *)data;
+ seg->len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ &vmreg->regval);
+ break;
+ case VM_SET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ vmreg->regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_get_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ &vmcap->capval);
+ break;
+ case VM_SET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_set_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ vmcap->capval);
+ break;
+ case VM_SET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_set_x2apic_state(sc->vm,
+ x2apic->cpuid, x2apic->state);
+ break;
+ case VM_GET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_get_x2apic_state(sc->vm,
+ x2apic->cpuid, &x2apic->state);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ if (state_changed == 1) {
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ } else if (state_changed == 2) {
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ }
+
+done:
+ return (error);
+}
+
+static int
+vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ int error;
+ struct vmmdev_softc *sc;
+
+ error = -1;
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc != NULL && (nprot & PROT_EXEC) == 0) {
+ *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
+ if (*paddr != (vm_paddr_t)-1)
+ error = 0;
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static void
+vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink)
+{
+
+ /*
+ * XXX must stop virtual machine instances that may be still
+ * running and cleanup their state.
+ */
+ if (sc->cdev)
+ destroy_dev(sc->cdev);
+
+ if (sc->vm)
+ vm_destroy(sc->vm);
+
+ if (unlink) {
+ mtx_lock(&vmmdev_mtx);
+ SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+ mtx_unlock(&vmmdev_mtx);
+ }
+
+ free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ char buf[VM_MAX_NAMELEN];
+ struct vmmdev_softc *sc;
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ /*
+ * XXX TODO if any process has this device open then fail
+ */
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ sc->cdev->si_drv1 = NULL;
+ mtx_unlock(&vmmdev_mtx);
+
+ vmmdev_destroy(sc, TRUE);
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+static struct cdevsw vmmdevsw = {
+ .d_name = "vmmdev",
+ .d_version = D_VERSION,
+ .d_ioctl = vmmdev_ioctl,
+ .d_mmap = vmmdev_mmap,
+ .d_read = vmmdev_rw,
+ .d_write = vmmdev_rw,
+};
+
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vm *vm;
+ struct vmmdev_softc *sc, *sc2;
+ char buf[VM_MAX_NAMELEN];
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ mtx_unlock(&vmmdev_mtx);
+ if (sc != NULL)
+ return (EEXIST);
+
+ vm = vm_create(buf);
+ if (vm == NULL)
+ return (EINVAL);
+
+ sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+ sc->vm = vm;
+
+ /*
+	 * Look up the name again in case somebody sneaked in while we
+	 * dropped the lock.
+ */
+ mtx_lock(&vmmdev_mtx);
+ sc2 = vmmdev_lookup(buf);
+ if (sc2 == NULL)
+ SLIST_INSERT_HEAD(&head, sc, link);
+ mtx_unlock(&vmmdev_mtx);
+
+ if (sc2 != NULL) {
+ vmmdev_destroy(sc, FALSE);
+ return (EEXIST);
+ }
+
+ sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "vmm/%s", buf);
+ sc->cdev->si_drv1 = sc;
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_create, "A", NULL);
+
+void
+vmmdev_init(void)
+{
+ mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+int
+vmmdev_cleanup(void)
+{
+ int error;
+
+ if (SLIST_EMPTY(&head))
+ error = 0;
+ else
+ error = EBUSY;
+
+ return (error);
+}
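
The ioctl dispatch above is the entire userland surface of the vmm device; libvmmapi is a thin wrapper around it. A minimal sketch of driving the device node directly, assuming the struct vm_register layout (cpuid/regnum/regval) used in vmmdev_ioctl() above, the VM_GET_REGISTER request from vmm_dev.h, and VM_REG_GUEST_RIP from the vm_reg_name enum; the VM name is made up for illustration:

	#include <sys/ioctl.h>

	#include <machine/vmm.h>
	#include <machine/vmm_dev.h>

	#include <fcntl.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct vm_register vmreg;
		int fd;

		/* Node created beforehand with: sysctl hw.vmm.create=testvm */
		fd = open("/dev/vmm/testvm", O_RDWR);
		if (fd < 0)
			return (1);

		vmreg.cpuid = 0;
		vmreg.regnum = VM_REG_GUEST_RIP;
		if (ioctl(fd, VM_GET_REGISTER, &vmreg) == 0)
			printf("vcpu0 rip 0x%016lx\n", vmreg.regval);
		return (0);
	}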
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
new file mode 100644
index 0000000..8dfef73
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+ vmm_host_efer = rdmsr(MSR_EFER);
+ vmm_host_pat = rdmsr(MSR_PAT);
+
+ /*
+ * We always want CR0.TS to be set when the processor does a VM exit.
+ *
+ * With emulation turned on unconditionally after a VM exit, we are
+ * able to trap inadvertent use of the FPU until the guest FPU state
+ * has been safely squirreled away.
+ */
+ vmm_host_cr0 = rcr0() | CR0_TS;
+
+ vmm_host_cr4 = rcr4();
+}
+
+uint64_t
+vmm_get_host_pat(void)
+{
+
+ return (vmm_host_pat);
+}
+
+uint64_t
+vmm_get_host_efer(void)
+{
+
+ return (vmm_host_efer);
+}
+
+uint64_t
+vmm_get_host_cr0(void)
+{
+
+ return (vmm_host_cr0);
+}
+
+uint64_t
+vmm_get_host_cr4(void)
+{
+
+ return (vmm_host_cr4);
+}
+
+uint64_t
+vmm_get_host_datasel(void)
+{
+
+ return (GSEL(GDATA_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_codesel(void)
+{
+
+ return (GSEL(GCODE_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_tsssel(void)
+{
+
+ return (GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_fsbase(void)
+{
+
+ return (0);
+}
+
+uint64_t
+vmm_get_host_idtrbase(void)
+{
+
+ return (r_idt.rd_base);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
new file mode 100644
index 0000000..839f54a
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_HOST_H_
+#define _VMM_HOST_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void vmm_host_state_init(void);
+
+uint64_t vmm_get_host_pat(void);
+uint64_t vmm_get_host_efer(void);
+uint64_t vmm_get_host_cr0(void);
+uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_datasel(void);
+uint64_t vmm_get_host_codesel(void);
+uint64_t vmm_get_host_tsssel(void);
+uint64_t vmm_get_host_fsbase(void);
+uint64_t vmm_get_host_idtrbase(void);
+
+/*
+ * Inline access to host state that is used on every VM entry
+ */
+static __inline uint64_t
+vmm_get_host_trbase(void)
+{
+
+ return ((uint64_t)PCPU_GET(tssp));
+}
+
+static __inline uint64_t
+vmm_get_host_gdtrbase(void)
+{
+
+ return ((uint64_t)&gdt[NGDT * curcpu]);
+}
+
+struct pcpu;
+extern struct pcpu __pcpu[];
+
+static __inline uint64_t
+vmm_get_host_gsbase(void)
+{
+
+ return ((uint64_t)&__pcpu[curcpu]);
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
new file mode 100644
index 0000000..e73f6bb
--- /dev/null
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -0,0 +1,810 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ },
+ [0x81] = {
+ /* XXX Group 1 extended opcode - not just AND */
+ .op_byte = 0x81,
+ .op_type = VIE_OP_TYPE_AND,
+ .op_flags = VIE_OP_F_IMM,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
+
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
+
+static enum vm_reg_name gpr_map[16] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15
+};
+
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ case 0x81:
+ /*
+ * AND reg (ModRM:reg) with immediate and store the
+ * result in reg
+ *
+	 * 81 /4		and r/m32, imm32
+	 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
+ *
+ * Currently, only the AND operation of the 0x81 opcode
+ * is implemented (ModRM:reg = b100).
+ */
+ if ((vie->reg & 7) != 4)
+ break;
+
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ val1 &= vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
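
vmm_emulate_instruction() is deliberately agnostic about what backs the faulting address: the caller passes in the accessors. A toy callback pair matching the signatures implied by the memread/memwrite calls above (the actual mem_region_read_t and mem_region_write_t typedefs live in vmm_instruction_emul.h), backing a single pretend 4-byte register:

	#include <stdint.h>
	#include <errno.h>

	static uint32_t toy_reg;	/* pretend MMIO register */

	static int
	toy_memread(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
	    int size, void *arg)
	{
		if (size != 4)
			return (EINVAL);
		*rval = toy_reg;
		return (0);
	}

	static int
	toy_memwrite(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
	    int size, void *arg)
	{
		if (size != 4)
			return (EINVAL);
		toy_reg = (uint32_t)wval;
		return (0);
	}

A decoded 'vie' would then be emulated against this region with vmm_emulate_instruction(vm, vcpuid, gpa, vie, toy_memread, toy_memwrite, NULL).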
+
+#ifdef _KERNEL
+static void
+vie_init(struct vie *vie)
+{
+
+ bzero(vie, sizeof(struct vie));
+
+ vie->base_register = VM_REG_LAST;
+ vie->index_register = VM_REG_LAST;
+}
+
+static int
+gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
+ uint64_t *gpa, uint64_t *gpaend)
+{
+ vm_paddr_t hpa;
+ int nlevels, ptpshift, ptpindex;
+ uint64_t *ptpbase, pte, pgsize;
+
+ /*
+ * XXX assumes 64-bit guest with 4 page walk levels
+ */
+ nlevels = 4;
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits and the upper 12 bits */
+ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+
+ hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
+ if (hpa == -1)
+ goto error;
+
+ ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
+
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gla >> ptpshift) & 0x1FF;
+ pgsize = 1UL << ptpshift;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0)
+ goto error;
+
+ if (pte & PG_PS) {
+ if (pgsize > 1 * GB)
+ goto error;
+ else
+ break;
+ }
+
+ ptpphys = pte;
+ }
+
+ /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
+ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
+ *gpa = pte | (gla & (pgsize - 1));
+ *gpaend = pte + pgsize;
+ return (0);
+
+error:
+ return (-1);
+}
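
The triple shift in gla2gpa() is a branch-free mask that keeps bits 12..51 of a page-table entry: the first right shift drops the low 12 flag bits, the left shift by 24 pushes the top 12 bits out of the 64-bit register, and the final right shift restores the remaining bits to their original positions. A standalone check of that identity:

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t x = 0xfff0000deadbefffUL;	/* junk in top/bottom 12 bits */
		uint64_t m = x;

		m >>= 12; m <<= 24; m >>= 12;		/* as in gla2gpa() */
		assert(m == (x & 0x000ffffffffff000UL));
		return (0);
	}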
+
+int
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
+ uint64_t cr3, struct vie *vie)
+{
+ int n, err;
+ uint64_t hpa, gpa, gpaend, off;
+
+ /*
+ * XXX cache previously fetched instructions using 'rip' as the tag
+ */
+
+ if (inst_length > VIE_INST_SIZE)
+ panic("vmm_fetch_instruction: invalid length %d", inst_length);
+
+ vie_init(vie);
+
+ /* Copy the instruction into 'vie' */
+ while (vie->num_valid < inst_length) {
+ err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
+ if (err)
+ break;
+
+ off = gpa & PAGE_MASK;
+ n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
+
+ hpa = vm_gpa2hpa(vm, gpa, n);
+ if (hpa == -1)
+ break;
+
+ bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+
+ rip += n;
+ vie->num_valid += n;
+ }
+
+ if (vie->num_valid == inst_length)
+ return (0);
+ else
+ return (-1);
+}
+
+static int
+vie_peek(struct vie *vie, uint8_t *x)
+{
+
+ if (vie->num_processed < vie->num_valid) {
+ *x = vie->inst[vie->num_processed];
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+vie_advance(struct vie *vie)
+{
+
+ vie->num_processed++;
+}
+
+static int
+decode_rex(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ if (x >= 0x40 && x <= 0x4F) {
+ vie->rex_w = x & 0x8 ? 1 : 0;
+ vie->rex_r = x & 0x4 ? 1 : 0;
+ vie->rex_x = x & 0x2 ? 1 : 0;
+ vie->rex_b = x & 0x1 ? 1 : 0;
+
+ vie_advance(vie);
+ }
+
+ return (0);
+}
+
+static int
+decode_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = one_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+ return (0);
+}
+
+/*
+ * XXX assuming 32-bit or 64-bit guest
+ */
+static int
+decode_modrm(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->mod = (x >> 6) & 0x3;
+ vie->rm = (x >> 0) & 0x7;
+ vie->reg = (x >> 3) & 0x7;
+
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
+ if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
+ (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
+ } else {
+ vie->rm |= (vie->rex_b << 3);
+ }
+
+ vie->reg |= (vie->rex_r << 3);
+
+ /* SIB */
+ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+ goto done;
+
+ vie->base_register = gpr_map[vie->rm];
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ case VIE_MOD_INDIRECT:
+ if (vie->rm == VIE_RM_DISP32) {
+ vie->disp_bytes = 4;
+ vie->base_register = VM_REG_LAST; /* no base */
+ }
+ break;
+ }
+
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
+ vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+		 * Special case: the base register is unused when mod = 0
+		 * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
+
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+ int n, i;
+ uint8_t x;
+
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->disp_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_displacement: invalid disp_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->displacement = u.signed8; /* sign-extended */
+ else
+ vie->displacement = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->imm_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_immediate: invalid imm_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
+int
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+
+ if (decode_rex(vie))
+ return (-1);
+
+ if (decode_opcode(vie))
+ return (-1);
+
+ if (decode_modrm(vie))
+ return (-1);
+
+ if (decode_sib(vie))
+ return (-1);
+
+ if (decode_displacement(vie))
+ return (-1);
+
+ if (decode_immediate(vie))
+ return (-1);
+
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
+ return (0);
+}
+#endif /* _KERNEL */
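
As a concrete instance of the ModRM arithmetic in decode_modrm(), take the two-byte instruction 89 08 (mov %ecx,(%rax)), which is exactly the kind of store to a memory-mapped region this decoder targets. A standalone illustration using the same shifts and masks:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint8_t modrm = 0x08;			/* second byte of "89 08" */
		uint8_t mod = (modrm >> 6) & 0x3;	/* 0: indirect, no disp */
		uint8_t reg = (modrm >> 3) & 0x7;	/* 1: %rcx (gpr_map[1]) */
		uint8_t rm  = (modrm >> 0) & 0x7;	/* 0: %rax (gpr_map[0]) */

		printf("mod=%u reg=%u rm=%u\n", mod, reg, rm);
		return (0);
	}

With no REX prefix, opcode 0x89 and these fields make emulate_mov() read %rcx and write its low 4 bytes to the guest physical address.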
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
new file mode 100644
index 0000000..643d326
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+#include <machine/md_var.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+
+extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
+
+/*
+ * The default is to use the IPI_AST to interrupt a vcpu.
+ */
+int vmm_ipinum = IPI_AST;
+
+CTASSERT(APIC_SPURIOUS_INT == 255);
+
+void
+vmm_ipi_init(void)
+{
+ int idx;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as our IPI vector. We install the 'justreturn' handler at that
+ * vector and use it to interrupt the vcpus.
+ *
+ * We do this because the IPI_AST is heavyweight and saves all
+	 * registers in the trapframe. This is overkill for our use case,
+	 * which is simply to EOI the interrupt and return.
+ */
+ idx = APIC_SPURIOUS_INT;
+ while (--idx >= APIC_IPI_INTS) {
+ ip = &idt[idx];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+ vmm_ipinum = idx;
+ setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+ SEL_KPL, 0);
+ break;
+ }
+ }
+
+ if (vmm_ipinum != IPI_AST && bootverbose) {
+ printf("vmm_ipi_init: installing ipi handler to interrupt "
+ "vcpus at vector %d\n", vmm_ipinum);
+ }
+}
+
+void
+vmm_ipi_cleanup(void)
+{
+ if (vmm_ipinum != IPI_AST)
+ setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
new file mode 100644
index 0000000..91552e3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+struct vm;
+
+extern int vmm_ipinum;
+
+void vmm_ipi_init(void);
+void vmm_ipi_cleanup(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
new file mode 100644
index 0000000..e691c61
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#define KTR_VMM KTR_GEN
+
+#define VMM_CTR0(vm, vcpuid, format) \
+CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
+
+#define VMM_CTR1(vm, vcpuid, format, p1) \
+CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1))
+
+#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
+CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2))
+
+#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2), (p3))
+#endif
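
Each VMM_CTRn macro expands to the corresponding ktr(4) CTRn macro with a "vm name-vcpu(hostcpu)" prefix, so the tag comes for free at every call site. A hypothetical call site (the event text is invented for illustration):

	VMM_CTR1(vm, vcpuid, "injecting interrupt at vector %d", vector);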
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
new file mode 100644
index 0000000..d024b71
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+int
+lapic_pending_intr(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_pending_intr(vlapic));
+}
+
+void
+lapic_intr_accepted(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_intr_accepted(vlapic, vector);
+}
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ vlapic_set_intr_ready(vlapic, vector);
+
+ vm_interrupt_hostcpu(vm, cpu);
+
+ return (0);
+}
+
+int
+lapic_timer_tick(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_timer_tick(vlapic));
+}
+
+static boolean_t
+x2apic_msr(u_int msr)
+{
+ if (msr >= 0x800 && msr <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
+ return ((msr - 0x800) << 4);
+}
+
+boolean_t
+lapic_msr(u_int msr)
+{
+
+ if (x2apic_msr(msr) || (msr == MSR_APICBASE))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+int
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ *rval = vlapic_get_apicbase(vlapic);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_read(vlapic, offset, DWORD, rval);
+ }
+
+ return (error);
+}
+
+int
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ vlapic_set_apicbase(vlapic, val);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_write(vlapic, offset, DWORD, val);
+ }
+
+ return (error);
+}
+
+int
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
+}
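
The x2APIC MSR range maps linearly onto the legacy MMIO register layout: x2apic_msr_to_regoff() shifts by 4 because consecutive x2APIC MSRs correspond to 16-byte-spaced memory-mapped registers, which is also why the MMIO handlers above insist on 16-byte alignment. A quick check of the mapping (MSR 0x808 is the x2APIC TPR, whose MMIO twin sits at offset 0x80):

	#include <assert.h>

	static unsigned
	x2apic_msr_to_regoff(unsigned msr)	/* mirrors the static above */
	{
		return ((msr - 0x800) << 4);
	}

	int
	main(void)
	{
		assert(x2apic_msr_to_regoff(0x808) == 0x80);
		/* every offset is 16-byte aligned, matching the (off & 0xf) test */
		assert((x2apic_msr_to_regoff(0x8ff) & 0xf) == 0);
		return (0);
	}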
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
new file mode 100644
index 0000000..a79912e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+boolean_t lapic_msr(u_int num);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
+
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+
+int lapic_timer_tick(struct vm *vm, int cpu);
+
+/*
+ * Returns a vector between 32 and 255 if an interrupt is pending in the
+ * IRR that can be delivered based on the current state of ISR and TPR.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ *
+ * Returns -1 if there is no eligible vector that can be delivered to the
+ * guest at this time.
+ */
+int lapic_pending_intr(struct vm *vm, int cpu);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'lapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
+
+/*
+ * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
+ * for the 'cpu'; the pending state is recorded in the IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
new file mode 100644
index 0000000..04f99b1
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/linker.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+#include <machine/vmparam.h>
+#include <machine/pmap.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+SYSCTL_DECL(_hw_vmm);
+
+static u_long pages_allocated;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
+ &pages_allocated, 0, "4KB pages allocated");
+
+static void
+update_pages_allocated(int howmany)
+{
+ pages_allocated += howmany; /* XXX locking? */
+}
+
+int
+vmm_mem_init(void)
+{
+
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ int flags;
+ vm_page_t m;
+ vm_paddr_t pa;
+
+ if (size != PAGE_SIZE)
+ panic("vmm_mem_alloc: invalid allocation size %lu", size);
+
+ flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO;
+
+ while (1) {
+ /*
+ * XXX need policy to determine when to back off the allocation
+ */
+ m = vm_page_alloc(NULL, 0, flags);
+ if (m == NULL)
+ VM_WAIT;
+ else
+ break;
+ }
+
+ pa = VM_PAGE_TO_PHYS(m);
+
+ if ((m->flags & PG_ZERO) == 0)
+ pagezero((void *)PHYS_TO_DMAP(pa));
+ m->valid = VM_PAGE_BITS_ALL;
+
+ update_pages_allocated(1);
+
+ return (pa);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ vm_page_t m;
+
+ if (base & PAGE_MASK) {
+ panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+ "0x%0x boundary\n", base, PAGE_SIZE);
+ }
+
+ if (length != PAGE_SIZE)
+ panic("vmm_mem_free: invalid length %lu", length);
+
+ m = PHYS_TO_VM_PAGE(base);
+ m->wire_count--;
+ vm_page_free(m);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+
+ update_pages_allocated(-1);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (ptoa(Maxmem));
+}
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
new file mode 100644
index 0000000..7d45c74
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
new file mode 100644
index 0000000..d97c819
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -0,0 +1,254 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+
+#define VMM_MSR_F_EMULATE 0x01
+#define VMM_MSR_F_READONLY 0x02
+#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
+
+struct vmm_msr {
+ int num;
+ int flags;
+ uint64_t hostval;
+};
+
+static struct vmm_msr vmm_msr[] = {
+ { MSR_LSTAR, 0 },
+ { MSR_CSTAR, 0 },
+ { MSR_STAR, 0 },
+ { MSR_SF_MASK, 0 },
+ { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
+ { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
+ { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
+};
+
+#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
+CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
+
+#define readonly_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
+
+#define emulated_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
+
+#define invalid_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
+
+void
+vmm_msr_init(void)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ /*
+ * XXX this assumes that the value of the host msr does not
+ * change after we have cached it.
+ */
+ vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
+ }
+}
+
+void
+guest_msrs_init(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ switch (vmm_msr[i].num) {
+ case MSR_LSTAR:
+ case MSR_CSTAR:
+ case MSR_STAR:
+ case MSR_SF_MASK:
+ case MSR_BIOS_SIGN:
+ case MSR_MCG_CAP:
+ guest_msrs[i] = 0;
+ break;
+ case MSR_PAT:
+ guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ break;
+ default:
+ panic("guest_msrs_init: missing initialization for msr "
+ "0x%0x", vmm_msr[i].num);
+ }
+ }
+}
+
+static int
+msr_num_to_idx(u_int num)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++)
+ if (vmm_msr[i].num == num)
+ return (i);
+
+ return (-1);
+}
+
+int
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+{
+ int idx;
+ uint64_t *guest_msrs;
+
+ if (lapic_msr(num))
+ return (lapic_wrmsr(vm, cpu, num, val));
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx))
+ return (EINVAL);
+
+ if (!readonly_msr(idx)) {
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ /* Stash the value */
+ guest_msrs[idx] = val;
+
+ /* Update processor state for non-emulated MSRs */
+ if (!emulated_msr(idx))
+ wrmsr(vmm_msr[idx].num, val);
+ }
+
+ return (0);
+}
+
+int
+emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+{
+ int error, idx;
+ uint32_t eax, edx;
+ uint64_t result, *guest_msrs;
+
+ if (lapic_msr(num)) {
+ error = lapic_rdmsr(vm, cpu, num, &result);
+ goto done;
+ }
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+ result = guest_msrs[idx];
+
+ /*
+	 * If this is not an emulated MSR, make sure that the processor
+ * state matches our cached state.
+ */
+ if (!emulated_msr(idx) && (rdmsr(num) != result)) {
+ panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
+ "(0x%016lx) and actual (0x%016lx) values", num,
+ result, rdmsr(num));
+ }
+
+ error = 0;
+
+done:
+ if (error == 0) {
+ eax = result;
+ edx = result >> 32;
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
+ if (error)
+ panic("vm_set_register(rax) error %d", error);
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
+ if (error)
+ panic("vm_set_register(rdx) error %d", error);
+ }
+ return (error);
+}
+
+void
+restore_guest_msrs(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, guest_msrs[i]);
+ }
+}
+
+void
+restore_host_msrs(struct vm *vm, int cpu)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
+ }
+}
+
+/*
+ * Must be called by the CPU-specific code before any guests are
+ * created
+ */
+void
+guest_msr_valid(int msr)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (vmm_msr[i].num == msr && invalid_msr(i)) {
+ vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
+ }
+ }
+}
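
emulate_rdmsr() hands the 64-bit result back the way the hardware RDMSR instruction does, split across %eax (low half) and %edx (high half). A quick illustration of that split:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t result = 0x1122334455667788UL;
		uint32_t eax = result;		/* low 32 bits, as in emulate_rdmsr() */
		uint32_t edx = result >> 32;	/* high 32 bits */

		printf("eax=0x%08x edx=0x%08x\n", eax, edx);
		return (0);
	}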
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
new file mode 100644
index 0000000..8a1fda3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MSR_H_
+#define _VMM_MSR_H_
+
+#define VMM_MSR_NUM 16
+struct vm;
+
+void vmm_msr_init(void);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+void guest_msrs_init(struct vm *vm, int cpu);
+void guest_msr_valid(int msr);
+void restore_host_msrs(struct vm *vm, int cpu);
+void restore_guest_msrs(struct vm *vm, int cpu);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
new file mode 100644
index 0000000..ae60979
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "vmm_stat.h"
+
+static int vstnum;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+void
+vmm_stat_init(void *arg)
+{
+ struct vmm_stat_type *vst = arg;
+
+ /* We require all stats to identify themselves with a description */
+ if (vst->desc == NULL)
+ return;
+
+ if (vstnum >= MAX_VMM_STAT_TYPES) {
+ printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc);
+ return;
+ }
+
+ vst->index = vstnum;
+ vsttab[vstnum++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+ int i;
+ uint64_t *stats;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ stats = vcpu_stats(vm, vcpu);
+ for (i = 0; i < vstnum; i++)
+ buf[i] = stats[i];
+ *num_stats = vstnum;
+ return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+ u_long size;
+
+ size = vstnum * sizeof(uint64_t);
+
+ return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+}
+
+void
+vmm_stat_free(void *vp)
+{
+ free(vp, M_VMM_STAT);
+}
+
+const char *
+vmm_stat_desc(int index)
+{
+
+ if (index >= 0 && index < vstnum)
+ return (vsttab[index]->desc);
+ else
+ return (NULL);
+}
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
new file mode 100644
index 0000000..7c075a6
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
+
+struct vmm_stat_type {
+ const char *desc; /* description of statistic */
+ int index; /* position in the stats buffer */
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, desc) \
+ struct vmm_stat_type type[1] = { \
+ { desc, -1 } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' should be large enough to fit 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+const char *vmm_stat_desc(int index);
+
+static __inline void
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats = vcpu_stats(vm, vcpu);
+ if (vst->index >= 0)
+ stats[vst->index] += x;
+#endif
+}
+
+#endif
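
For reference, a consumer of this interface defines a stat type with
VMM_STAT_DEFINE() and bumps it from a hot path. A minimal sketch in kernel
context (the stat name and call site here are hypothetical, not part of
this change):

    #include <sys/param.h>
    #include <sys/kernel.h>

    #include "vmm_stat.h"

    VMM_STAT_DEFINE(VMEXIT_TOTAL, "total number of vm exits");

    static void
    example_exit_handler(struct vm *vm, int vcpu)
    {
            /* Compiles to a no-op unless built with -DVMM_KEEP_STATS. */
            vmm_stat_incr(vm, vcpu, VMEXIT_TOTAL, 1);
    }

The SYSINIT emitted by the macro runs vmm_stat_init() at module load, which
assigns the slot in the per-vcpu stats buffer that vmm_stat_incr() updates.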
diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S
new file mode 100644
index 0000000..2afc608
--- /dev/null
+++ b/sys/amd64/vmm/vmm_support.S
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define LOCORE
+
+#include <machine/asmacros.h>
+
+#define LA_EOI 0xB0
+
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(justreturn)
+ pushq %rax
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax)
+ popq %rax
+ iretq
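
The justreturn vector does nothing beyond acknowledging the interrupt:
LA_EOI (0xB0) is the byte offset of the local APIC EOI register, and the
IPI itself is presumably all that is needed to kick the target CPU out of
whatever it was doing (e.g. guest context). A rough C equivalent of the
handler body, as a sketch:

    #include <stdint.h>

    /* 'lapic' stands for the kernel's local APIC mapping, the same
     * pointer loaded by 'movq lapic, %rax' above. */
    static void
    justreturn_in_c(volatile uint32_t *lapic)
    {
            lapic[0xB0 / sizeof(uint32_t)] = 0;     /* signal EOI */
    }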
diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c
new file mode 100644
index 0000000..f245f92
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
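
DUMP_REG and DUMP_SEG lean on two preprocessor features: #x stringizes the
argument (adjacent string literals then concatenate), and tf->tf_ ## x
pastes it into the member name. For example, DUMP_REG(rip) expands to:

    printf("rip" "\t\t0x%016lx\n", (long)(tf->tf_rip));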
diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h
new file mode 100644
index 0000000..7f82332
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
new file mode 100644
index 0000000..94abe09
--- /dev/null
+++ b/sys/amd64/vmm/x86.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+
+#include "x86.h"
+
+#define CPUID_VM_HIGH 0x40000000
+
+static const char bhyve_id[12] = "bhyve bhyve ";
+
+int
+x86_emulate_cpuid(struct vm *vm, int vcpu_id,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int error;
+ unsigned int func, regs[4];
+ enum x2apic_state x2apic_state;
+
+ func = *eax;
+
+ /*
+ * Requests for invalid CPUID levels should map to the highest
+ * available level instead.
+ */
+ if (cpu_exthigh != 0 && *eax >= 0x80000000) {
+ if (*eax > cpu_exthigh)
+ *eax = cpu_exthigh;
+ } else if (*eax >= 0x40000000) {
+ if (*eax > CPUID_VM_HIGH)
+ *eax = CPUID_VM_HIGH;
+ } else if (*eax > cpu_high) {
+ *eax = cpu_high;
+ }
+
+ /*
+ * In general the approach used for CPU topology is to
+ * advertise a flat topology in which each vCPU is a separate
+ * package, with no multi-core or SMT.
+ */
+ switch (func) {
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_0000_000A:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0000:
+ case CPUID_8000_0001:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0007:
+ case CPUID_8000_0008:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_0000_0001:
+ do_cpuid(1, regs);
+
+ error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
+ if (error) {
+ panic("x86_emulate_cpuid: error %d "
+ "fetching x2apic state", error);
+ }
+
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_LOCAL_APIC_ID);
+ regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+ * Don't expose VMX, SpeedStep or TM2 capability.
+ * Advertise x2APIC capability and Hypervisor guest.
+ */
+ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+
+ regs[2] |= CPUID2_HV;
+
+ if (x2apic_state != X2APIC_DISABLED)
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Hide xsave/osxsave/avx until the FPU save/restore
+ * issues are resolved
+ */
+ regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
+ CPUID2_AVX);
+
+ /*
+ * Hide monitor/mwait until we know how to deal with
+ * these instructions.
+ */
+ regs[2] &= ~CPUID2_MON;
+
+ /*
+ * Hide thermal monitoring
+ */
+ regs[3] &= ~(CPUID_ACPI | CPUID_TM);
+
+ /*
+ * Machine check handling is done in the host.
+ * Hide MTRR capability.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
+ /*
+ * Disable multi-core.
+ */
+ regs[1] &= ~CPUID_HTT_CORES;
+ regs[3] &= ~CPUID_HTT;
+ break;
+
+ case CPUID_0000_0004:
+ do_cpuid(4, regs);
+
+ /*
+ * Do not expose topology.
+ */
+ regs[0] &= 0xffff8000;
+ regs[0] |= 0x04008000;
+ break;
+
+ case CPUID_0000_0006:
+ case CPUID_0000_0007:
+ /*
+ * Handle the access, but report 0 for
+ * all options
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = vcpu_id;
+ break;
+
+ case 0x40000000:
+ regs[0] = CPUID_VM_HIGH;
+ bcopy(bhyve_id, &regs[1], 4);
+ bcopy(bhyve_id + 4, &regs[2], 4);
+ bcopy(bhyve_id + 8, &regs[3], 4);
+ break;
+ default:
+ /* XXX: Leaf 5? */
+ return (0);
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+ return (1);
+}
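
A guest can detect this hypervisor by checking the CPUID2_HV bit advertised
in leaf 1 and then querying leaf 0x40000000, where the 12-byte bhyve_id
signature comes back in EBX/ECX/EDX. A user-level sketch (the cpuid wrapper
is illustrative, not part of this change):

    #include <stdio.h>
    #include <string.h>

    static void
    cpuid(unsigned leaf, unsigned regs[4])
    {
            __asm __volatile("cpuid"
                : "=a" (regs[0]), "=b" (regs[1]),
                  "=c" (regs[2]), "=d" (regs[3])
                : "0" (leaf), "2" (0));
    }

    int
    main(void)
    {
            unsigned regs[4];
            char id[13];

            cpuid(0x40000000, regs);
            memcpy(id, &regs[1], 4);
            memcpy(id + 4, &regs[2], 4);
            memcpy(id + 8, &regs[3], 4);
            id[12] = '\0';
            printf("hypervisor: \"%s\"\n", id);     /* "bhyve bhyve " */
            return (0);
    }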
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
new file mode 100644
index 0000000..368e967
--- /dev/null
+++ b/sys/amd64/vmm/x86.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_0006 (0x6)
+#define CPUID_0000_0007 (0x7)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx);
+
+#endif
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 56c7437..f381c71 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -464,6 +464,11 @@ libkern/memset.c standard
compat/x86bios/x86bios.c optional x86bios | atkbd | dpms | vesa
contrib/x86emu/x86emu.c optional x86bios | atkbd | dpms | vesa
#
+# bvm console
+#
+dev/bvm/bvm_console.c optional bvmconsole
+dev/bvm/bvm_dbg.c optional bvmdebug
+#
# x86 shared code between IA32, AMD64 and PC98 architectures
#
x86/acpica/OsdEnvironment.c optional acpi
diff --git a/sys/dev/blackhole/blackhole.c b/sys/dev/blackhole/blackhole.c
new file mode 100644
index 0000000..9d02e50
--- /dev/null
+++ b/sys/dev/blackhole/blackhole.c
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/linker.h>
+#include <sys/libkern.h>
+
+#include <dev/pci/pcivar.h>
+
+static int
+linker_file_iterator(linker_file_t lf, void *arg)
+{
+ const char *file = arg;
+
+ if (strcmp(lf->filename, file) == 0)
+ return (1);
+ else
+ return (0);
+}
+
+static boolean_t
+pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
+
+static int
+pci_blackhole_probe(device_t dev)
+{
+ int bus, slot, func;
+
+ /*
+ * If 'vmm.ko' has also been loaded then don't try to claim
+ * any pci devices.
+ */
+ if (linker_file_foreach(linker_file_iterator, "vmm.ko"))
+ return (ENXIO);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ if (pptdev(bus, slot, func))
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+pci_blackhole_attach(device_t dev)
+{
+ /*
+ * We never really want to claim the devices but just want to prevent
+ * other drivers from getting to them.
+ */
+ return (ENXIO);
+}
+
+static device_method_t pci_blackhole_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, pci_blackhole_probe),
+ DEVMETHOD(device_attach, pci_blackhole_attach),
+
+ { 0, 0 }
+};
+
+static driver_t pci_blackhole_driver = {
+ "blackhole",
+ pci_blackhole_methods,
+};
+
+devclass_t blackhole_devclass;
+
+DRIVER_MODULE(blackhole, pci, pci_blackhole_driver, blackhole_devclass, 0, 0);
+MODULE_DEPEND(blackhole, pci, 1, 1, 1);
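
For reference, the pptdevs kernel environment variable parsed by pptdev()
is a space-separated list of bus/slot/function triplets, set from the
loader as in the 'setenv pptdevs' example above. A stand-alone sketch of
the same parsing:

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            char buf[] = "1/2/3 4/5/6";     /* same format as the tunable */
            char *cp, *cp2;
            int b, s, f;

            for (cp = buf; cp != NULL && *cp != '\0'; cp = cp2) {
                    if ((cp2 = strchr(cp, ' ')) != NULL)
                            *cp2++ = '\0';
                    if (sscanf(cp, "%d/%d/%d", &b, &s, &f) == 3)
                            printf("claim %d/%d/%d\n", b, s, f);
            }
            return (0);
    }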
diff --git a/sys/dev/bvm/bvm_console.c b/sys/dev/bvm/bvm_console.c
new file mode 100644
index 0000000..a0e70e5
--- /dev/null
+++ b/sys/dev/bvm/bvm_console.c
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/cons.h>
+#include <sys/tty.h>
+#include <sys/reboot.h>
+#include <sys/bus.h>
+
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+
+#ifndef BVMCONS_POLL_HZ
+#define BVMCONS_POLL_HZ 4
+#endif
+#define BVMBURSTLEN 16 /* max number of bytes to write in one chunk */
+
+static tsw_open_t bvm_tty_open;
+static tsw_close_t bvm_tty_close;
+static tsw_outwakeup_t bvm_tty_outwakeup;
+
+static struct ttydevsw bvm_ttydevsw = {
+ .tsw_flags = TF_NOPREFIX,
+ .tsw_open = bvm_tty_open,
+ .tsw_close = bvm_tty_close,
+ .tsw_outwakeup = bvm_tty_outwakeup,
+};
+
+static int polltime;
+static struct callout_handle bvm_timeouthandle
+ = CALLOUT_HANDLE_INITIALIZER(&bvm_timeouthandle);
+
+#if defined(KDB)
+static int alt_break_state;
+#endif
+
+#define BVM_CONS_PORT 0x220
+static int bvm_cons_port = BVM_CONS_PORT;
+
+#define BVM_CONS_SIG ('b' << 8 | 'v')
+
+static void bvm_timeout(void *);
+
+static cn_probe_t bvm_cnprobe;
+static cn_init_t bvm_cninit;
+static cn_term_t bvm_cnterm;
+static cn_getc_t bvm_cngetc;
+static cn_putc_t bvm_cnputc;
+static cn_grab_t bvm_cngrab;
+static cn_ungrab_t bvm_cnungrab;
+
+CONSOLE_DRIVER(bvm);
+
+static int
+bvm_rcons(u_char *ch)
+{
+ int c;
+
+ c = inl(bvm_cons_port);
+ if (c != -1) {
+ *ch = (u_char)c;
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+bvm_wcons(u_char ch)
+{
+
+ outl(bvm_cons_port, ch);
+}
+
+static void
+cn_drvinit(void *unused)
+{
+ struct tty *tp;
+
+ if (bvm_consdev.cn_pri != CN_DEAD &&
+ bvm_consdev.cn_name[0] != '\0') {
+ tp = tty_alloc(&bvm_ttydevsw, NULL);
+ tty_makedev(tp, NULL, "bvmcons");
+ }
+}
+
+static int
+bvm_tty_open(struct tty *tp)
+{
+ polltime = hz / BVMCONS_POLL_HZ;
+ if (polltime < 1)
+ polltime = 1;
+ bvm_timeouthandle = timeout(bvm_timeout, tp, polltime);
+
+ return (0);
+}
+
+static void
+bvm_tty_close(struct tty *tp)
+{
+
+ /* XXX Should be replaced with callout_stop(9) */
+ untimeout(bvm_timeout, tp, bvm_timeouthandle);
+}
+
+static void
+bvm_tty_outwakeup(struct tty *tp)
+{
+ int len, written;
+ u_char buf[BVMBURSTLEN];
+
+ for (;;) {
+ len = ttydisc_getc(tp, buf, sizeof(buf));
+ if (len == 0)
+ break;
+
+ written = 0;
+ while (written < len)
+ bvm_wcons(buf[written++]);
+ }
+}
+
+static void
+bvm_timeout(void *v)
+{
+ struct tty *tp;
+ int c;
+
+ tp = (struct tty *)v;
+
+ tty_lock(tp);
+ while ((c = bvm_cngetc(NULL)) != -1)
+ ttydisc_rint(tp, c, 0);
+ ttydisc_rint_done(tp);
+ tty_unlock(tp);
+
+ bvm_timeouthandle = timeout(bvm_timeout, tp, polltime);
+}
+
+static void
+bvm_cnprobe(struct consdev *cp)
+{
+ int disabled, port;
+
+ disabled = 0;
+ cp->cn_pri = CN_DEAD;
+
+ resource_int_value("bvmconsole", 0, "disabled", &disabled);
+ if (!disabled) {
+ if (resource_int_value("bvmconsole", 0, "port", &port) == 0)
+ bvm_cons_port = port;
+
+ if (inw(bvm_cons_port) == BVM_CONS_SIG)
+ cp->cn_pri = CN_REMOTE;
+ }
+}
+
+static void
+bvm_cninit(struct consdev *cp)
+{
+ int i;
+ const char *bootmsg = "Using bvm console.\n";
+
+ if (boothowto & RB_VERBOSE) {
+ for (i = 0; i < strlen(bootmsg); i++)
+ bvm_cnputc(cp, bootmsg[i]);
+ }
+
+ strcpy(cp->cn_name, "bvmcons");
+}
+
+static void
+bvm_cnterm(struct consdev *cp)
+{
+
+}
+
+static int
+bvm_cngetc(struct consdev *cp)
+{
+ unsigned char ch;
+
+ if (bvm_rcons(&ch) == 0) {
+#if defined(KDB)
+ kdb_alt_break(ch, &alt_break_state);
+#endif
+ return (ch);
+ }
+
+ return (-1);
+}
+
+static void
+bvm_cnputc(struct consdev *cp, int c)
+{
+
+ bvm_wcons(c);
+}
+
+static void
+bvm_cngrab(struct consdev *cp)
+{
+}
+
+static void
+bvm_cnungrab(struct consdev *cp)
+{
+}
+
+SYSINIT(cndev, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, cn_drvinit, NULL);
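
The guest-visible protocol is a single I/O port: inw() on it must return
the BVM_CONS_SIG signature ('b' << 8 | 'v', i.e. 0x6276) for the probe to
succeed, inl() yields the next input byte or -1 when none is pending, and
outl() emits one output byte. A hypothetical host-side read handler
consistent with this (bhyve's actual device model is consport.c, later in
this patch):

    #define BVM_CONS_SIG    ('b' << 8 | 'v')

    /* bytes == 2 models a guest inw(), 4 models a guest inl(). */
    static int
    console_port_read(int bytes, int (*getc_nonblock)(void))
    {
            if (bytes == 2)
                    return (BVM_CONS_SIG);  /* probe signature */
            return (getc_nonblock());       /* next byte, or -1 */
    }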
diff --git a/sys/dev/bvm/bvm_dbg.c b/sys/dev/bvm/bvm_dbg.c
new file mode 100644
index 0000000..1ba7ce0
--- /dev/null
+++ b/sys/dev/bvm/bvm_dbg.c
@@ -0,0 +1,100 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+
+#include <gdb/gdb.h>
+
+#include <machine/cpufunc.h>
+
+static gdb_probe_f bvm_dbg_probe;
+static gdb_init_f bvm_dbg_init;
+static gdb_term_f bvm_dbg_term;
+static gdb_getc_f bvm_dbg_getc;
+static gdb_putc_f bvm_dbg_putc;
+
+GDB_DBGPORT(bvm, bvm_dbg_probe, bvm_dbg_init, bvm_dbg_term,
+ bvm_dbg_getc, bvm_dbg_putc);
+
+#define BVM_DBG_PORT 0x224
+static int bvm_dbg_port = BVM_DBG_PORT;
+
+#define BVM_DBG_SIG ('B' << 8 | 'V')
+
+static int
+bvm_dbg_probe(void)
+{
+ int disabled, port;
+
+ disabled = 0;
+ resource_int_value("bvmdbg", 0, "disabled", &disabled);
+
+ if (!disabled) {
+ if (resource_int_value("bvmdbg", 0, "port", &port) == 0)
+ bvm_dbg_port = port;
+
+ if (inw(bvm_dbg_port) == BVM_DBG_SIG) {
+ /*
+ * Return a higher priority than 0 to override other
+ * gdb dbgport providers that may be present (e.g. uart)
+ */
+ return (1);
+ }
+ }
+
+ return (-1);
+}
+
+static void
+bvm_dbg_init(void)
+{
+}
+
+static void
+bvm_dbg_term(void)
+{
+}
+
+static void
+bvm_dbg_putc(int c)
+{
+
+ outl(bvm_dbg_port, c);
+}
+
+static int
+bvm_dbg_getc(void)
+{
+
+ return (inl(bvm_dbg_port));
+}
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 1344297..a53f640 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -48,6 +48,7 @@ SUBDIR= \
${_bxe} \
${_bios} \
${_bktr} \
+ ${_blackhole} \
${_bm} \
bridgestp \
bwi \
@@ -335,6 +336,7 @@ SUBDIR= \
vge \
${_viawd} \
vkbd \
+ ${_vmm} \
${_vpo} \
vr \
vte \
@@ -624,6 +626,7 @@ _amdtemp= amdtemp
_arcmsr= arcmsr
_asmc= asmc
_bktr= bktr
+_blackhole= blackhole
_bxe= bxe
_cardbus= cardbus
_cbb= cbb
@@ -720,6 +723,7 @@ _twa= twa
_vesa= vesa
_viawd= viawd
_virtio= virtio
+_vmm= vmm
_vxge= vxge
_x86bios= x86bios
_wbwd= wbwd
diff --git a/sys/modules/blackhole/Makefile b/sys/modules/blackhole/Makefile
new file mode 100644
index 0000000..a73cf44
--- /dev/null
+++ b/sys/modules/blackhole/Makefile
@@ -0,0 +1,9 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../dev/blackhole
+
+KMOD= blackhole
+SRCS= blackhole.c
+SRCS+= bus_if.h device_if.h pci_if.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/modules/vmm/Makefile b/sys/modules/vmm/Makefile
new file mode 100644
index 0000000..8b565da
--- /dev/null
+++ b/sys/modules/vmm/Makefile
@@ -0,0 +1,62 @@
+# $FreeBSD$
+
+KMOD= vmm
+
+SRCS= opt_ddb.h device_if.h bus_if.h pci_if.h
+
+CFLAGS+= -DVMM_KEEP_STATS -DSMP
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm/io
+CFLAGS+= -I${.CURDIR}/../../amd64/vmm/intel
+
+# generic vmm support
+.PATH: ${.CURDIR}/../../amd64/vmm
+SRCS+= vmm.c \
+ vmm_dev.c \
+ vmm_host.c \
+ vmm_instruction_emul.c \
+ vmm_ipi.c \
+ vmm_lapic.c \
+ vmm_mem.c \
+ vmm_msr.c \
+ vmm_stat.c \
+ vmm_util.c \
+ x86.c \
+ vmm_support.S
+
+.PATH: ${.CURDIR}/../../amd64/vmm/io
+SRCS+= iommu.c \
+ ppt.c \
+ vdev.c \
+ vlapic.c
+
+# intel-specific files
+.PATH: ${.CURDIR}/../../amd64/vmm/intel
+SRCS+= ept.c \
+ vmcs.c \
+ vmx_msr.c \
+ vmx.c \
+ vtd.c
+
+# amd-specific files
+.PATH: ${.CURDIR}/../../amd64/vmm/amd
+SRCS+= amdv.c
+
+OBJS= vmx_support.o
+
+CLEANFILES= vmx_assym.s vmx_genassym.o
+
+vmx_assym.s: vmx_genassym.o
+.if exists(@)
+vmx_assym.s: @/kern/genassym.sh
+.endif
+ sh @/kern/genassym.sh vmx_genassym.o > ${.TARGET}
+
+vmx_support.o: vmx_support.S vmx_assym.s
+ ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \
+ ${.IMPSRC} -o ${.TARGET}
+
+vmx_genassym.o: vmx_genassym.c @ machine x86
+ ${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC}
+
+.include <bsd.kmod.mk>
diff --git a/usr.sbin/Makefile.amd64 b/usr.sbin/Makefile.amd64
index 1a1bffe..5ee2165 100644
--- a/usr.sbin/Makefile.amd64
+++ b/usr.sbin/Makefile.amd64
@@ -10,6 +10,9 @@ SUBDIR+= acpi
SUBDIR+= apm
.endif
SUBDIR+= asf
+SUBDIR+= bhyve
+SUBDIR+= bhyvectl
+SUBDIR+= bhyveload
SUBDIR+= boot0cfg
.if ${MK_TOOLCHAIN} != "no"
SUBDIR+= btxld
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
new file mode 100644
index 0000000..078ef9a
--- /dev/null
+++ b/usr.sbin/bhyve/Makefile
@@ -0,0 +1,27 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyve
+
+DEBUG_FLAGS= -g -O0
+
+SRCS= acpi.c atpic.c bhyverun.c consport.c dbgport.c elcr.c inout.c
+SRCS+= ioapic.c mem.c mevent.c mptbl.c
+SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
+SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c uart.c
+SRCS+= xmsr.c spinup_ap.c
+
+.PATH: ${.CURDIR}/../../sys/amd64/vmm
+SRCS+= vmm_instruction_emul.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI} ${LIBMD} ${LIBPTHREAD}
+LDADD= -lvmmapi -lmd -lpthread
+
+WARNS?= 2
+
+CFLAGS+= -I${.CURDIR}/../../sys
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyve/acpi.c b/usr.sbin/bhyve/acpi.c
new file mode 100644
index 0000000..32effdc
--- /dev/null
+++ b/usr.sbin/bhyve/acpi.c
@@ -0,0 +1,844 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * bhyve ACPI table generator.
+ *
+ * Create the minimal set of ACPI tables required to boot FreeBSD (and
+ * hopefully other o/s's) by writing out ASL template files for each of
+ * the tables and then compiling them to AML with the Intel iasl compiler.
+ * The AML files are then read into guest memory.
+ *
+ * The tables are placed in the guest's ROM area just below 1MB physical,
+ * above the MPTable.
+ *
+ * Layout
+ * ------
+ * RSDP -> 0xf0400 (36 bytes fixed)
+ * RSDT -> 0xf0440 (36 bytes + 4*N table addrs, 2 used)
+ * XSDT -> 0xf0480 (36 bytes + 8*N table addrs, 2 used)
+ * MADT -> 0xf0500 (depends on #CPUs)
+ * FADT -> 0xf0600 (268 bytes)
+ * FACS -> 0xf0780 (64 bytes)
+ * DSDT -> 0xf0800 (variable - can go up to 0x100000)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+
+/*
+ * Define the base address of the ACPI tables, and the offsets to
+ * the individual tables
+ */
+#define BHYVE_ACPI_BASE 0xf0400
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+#define FADT_OFFSET 0x200
+#define FACS_OFFSET 0x380
+#define DSDT_OFFSET 0x400
+
+#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
+#define BHYVE_ASL_SUFFIX ".aml"
+#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
+
+#define BHYVE_PM_TIMER_ADDR 0x408
+
+static int basl_keep_temps;
+static int basl_verbose_iasl;
+static int basl_ncpu;
+static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
+
+/*
+ * Contains the full pathname of the template to be passed
+ * to mkstemp/mktemps(3)
+ */
+static char basl_template[MAXPATHLEN];
+static char basl_stemplate[MAXPATHLEN];
+
+struct basl_fio {
+ int fd;
+ FILE *fp;
+ char f_name[MAXPATHLEN];
+};
+
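+/*
+ * Each of the following macros expands to more than one statement, so
+ * they are wrapped in do { } while (0): without it, the 'goto err_exit'
+ * would detach from the preceding check if a macro were used as the
+ * body of an unbraced conditional.
+ */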
+#define EFPRINTF(...) \
+ do { err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit; } while (0)
+
+#define EFFLUSH(x) \
+ do { err = fflush(x); if (err != 0) goto err_exit; } while (0)
+
+static int
+basl_fwrite_rsdp(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDP template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 43\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 02\n");
+ EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n",
+ basl_acpi_base + RSDT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tLength : 00000024\n");
+ EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n",
+ basl_acpi_base + XSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_rsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve RSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT and FADT */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n",
+ basl_acpi_base + FADT_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_xsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve XSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add in pointers to the MADT and FADT */
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n",
+ basl_acpi_base + MADT_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n",
+ basl_acpi_base + FADT_OFFSET);
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_madt(FILE *fp)
+{
+ int err;
+ int i;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve MADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n");
+ EFPRINTF(fp, "\n");
+
+ /* Add a Processor Local APIC entry for each CPU */
+ for (i = 0; i < basl_ncpu; i++) {
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 08\n");
+ EFPRINTF(fp, "[0001]\t\tProcessor ID : %02d\n", i);
+ EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02d\n", i);
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n");
+ EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n");
+ EFPRINTF(fp, "\n");
+ }
+
+ /* Always a single IOAPIC entry, with ID 'ncpu' (one above the last local APIC ID) */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0C\n");
+ EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02d\n", basl_ncpu);
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Override the 8259 chained vector. XXX maybe not needed */
+ EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n");
+ EFPRINTF(fp, "[0001]\t\tLength : 0A\n");
+ EFPRINTF(fp, "[0001]\t\tBus : 00\n");
+ EFPRINTF(fp, "[0001]\t\tSource : 09\n");
+ EFPRINTF(fp, "[0004]\t\tInterrupt : 00000009\n");
+ EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tPolarity : 0\n");
+ EFPRINTF(fp, "\t\t\tTrigger Mode : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_fadt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FADT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n");
+ EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n");
+ EFPRINTF(fp, "[0001]\t\tRevision : 05\n");
+ EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+ EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+ EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n");
+ EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+ /* iasl will fill in the compiler ID/revision fields */
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+ EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp, "[0001]\t\tModel : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n");
+ EFPRINTF(fp, "[0002]\t\tSCI Interrupt : 0009\n");
+ EFPRINTF(fp, "[0004]\t\tSMI Command Port : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tACPI Enable Value : 00\n");
+ EFPRINTF(fp, "[0001]\t\tACPI Disable Value : 00\n");
+ EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n");
+ EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n",
+ BHYVE_PM_TIMER_ADDR);
+ EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n");
+ EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n");
+ EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n");
+ EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n");
+ EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n");
+ EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n");
+ EFPRINTF(fp, "[0001]\t\tRTC Century Index : 00\n");
+ EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n");
+ EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n");
+ EFPRINTF(fp, "[0001]\t\tReserved : 00\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n");
+ EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n");
+ EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 0\n");
+ EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n");
+ EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n");
+ EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n");
+ EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n");
+ EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n");
+ EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tReset Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0001]\t\tValue to cause reset : 00\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n",
+ basl_acpi_base + FACS_OFFSET);
+ EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n",
+ basl_acpi_base + DSDT_OFFSET);
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 20\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 10\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000001\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ /* Valid for bhyve */
+ EFPRINTF(fp,
+ "[0012]\t\tPM Timer Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 32\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n",
+ BHYVE_PM_TIMER_ADDR);
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 80\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 00\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp,
+ "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Control Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+ EFPRINTF(fp, "\n");
+
+ EFPRINTF(fp,
+ "[0012]\t\tSleep Status Register : [Generic Address Structure]\n");
+ EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n");
+ EFPRINTF(fp, "[0001]\t\tBit Width : 08\n");
+ EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n");
+ EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n");
+ EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_facs(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve FACS template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n");
+ EFPRINTF(fp, "[0004]\t\tLength : 00000040\n");
+ EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n");
+ EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n");
+ EFPRINTF(fp,
+ "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n");
+ EFPRINTF(fp, "[0001]\t\tVersion : 02\n");
+ EFPRINTF(fp, "[0003]\t\tReserved : 000000\n");
+ EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n");
+ EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_fwrite_dsdt(FILE *fp)
+{
+ int err;
+
+ err = 0;
+
+ EFPRINTF(fp, "/*\n");
+ EFPRINTF(fp, " * bhyve DSDT template\n");
+ EFPRINTF(fp, " */\n");
+ EFPRINTF(fp, "DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2,"
+ "\"BHYVE \", \"BVDSDT \", 0x00000001)\n");
+ EFPRINTF(fp, "{\n");
+ EFPRINTF(fp, " Scope (_SB)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (PCI0)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0A03\"))\n");
+ EFPRINTF(fp, " Name (_ADR, Zero)\n");
+ EFPRINTF(fp, " Name (_UID, One)\n");
+ EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " WordBusNumber (ResourceProducer, MinFixed,"
+ "MaxFixed, PosDecode,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0000, // Range Minimum\n");
+ EFPRINTF(fp, " 0x00FF, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0x0100, // Length\n");
+ EFPRINTF(fp, " ,, )\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0CF8, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0CF8, // Range Maximum\n");
+ EFPRINTF(fp, " 0x01, // Alignment\n");
+ EFPRINTF(fp, " 0x08, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
+ "PosDecode, EntireRange,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0000, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0CF7, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0x0CF8, // Length\n");
+ EFPRINTF(fp, " ,, , TypeStatic)\n");
+ EFPRINTF(fp, " WordIO (ResourceProducer, MinFixed, MaxFixed,"
+ "PosDecode, EntireRange,\n");
+ EFPRINTF(fp, " 0x0000, // Granularity\n");
+ EFPRINTF(fp, " 0x0D00, // Range Minimum\n");
+ EFPRINTF(fp, " 0xFFFF, // Range Maximum\n");
+ EFPRINTF(fp, " 0x0000, // Transl Offset\n");
+ EFPRINTF(fp, " 0xF300, // Length\n");
+ EFPRINTF(fp, " ,, , TypeStatic)\n");
+ EFPRINTF(fp, " })\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "\n");
+ EFPRINTF(fp, " Scope (_SB.PCI0)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (ISA)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_ADR, 0x00010000)\n");
+ EFPRINTF(fp, " OperationRegion (P40C, PCI_Config, 0x60, 0x04)\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "\n");
+ EFPRINTF(fp, " Scope (_SB.PCI0.ISA)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Device (RTC)\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " Name (_HID, EisaId (\"PNP0B00\"))\n");
+ EFPRINTF(fp, " Name (_CRS, ResourceTemplate ()\n");
+ EFPRINTF(fp, " {\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0070, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0070, // Range Maximum\n");
+ EFPRINTF(fp, " 0x10, // Alignment\n");
+ EFPRINTF(fp, " 0x02, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " IRQNoFlags ()\n");
+ EFPRINTF(fp, " {8}\n");
+ EFPRINTF(fp, " IO (Decode16,\n");
+ EFPRINTF(fp, " 0x0072, // Range Minimum\n");
+ EFPRINTF(fp, " 0x0072, // Range Maximum\n");
+ EFPRINTF(fp, " 0x02, // Alignment\n");
+ EFPRINTF(fp, " 0x06, // Length\n");
+ EFPRINTF(fp, " )\n");
+ EFPRINTF(fp, " })\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, " }\n");
+ EFPRINTF(fp, "}\n");
+
+ EFFLUSH(fp);
+
+ return (0);
+
+err_exit:
+ return (errno);
+}
+
+static int
+basl_open(struct basl_fio *bf, int suffix)
+{
+ int err;
+
+ err = 0;
+
+ if (suffix) {
+ strncpy(bf->f_name, basl_stemplate, MAXPATHLEN);
+ bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX));
+ } else {
+ strncpy(bf->f_name, basl_template, MAXPATHLEN);
+ bf->fd = mkstemp(bf->f_name);
+ }
+
+ if (bf->fd >= 0) {
+ bf->fp = fdopen(bf->fd, "w+");
+ if (bf->fp == NULL) {
+ unlink(bf->f_name);
+ close(bf->fd);
+ }
+ } else {
+ err = 1;
+ }
+
+ return (err);
+}
+
+static void
+basl_close(struct basl_fio *bf)
+{
+
+ if (!basl_keep_temps)
+ unlink(bf->f_name);
+ fclose(bf->fp);
+}
+
+static int
+basl_start(struct basl_fio *in, struct basl_fio *out)
+{
+ int err;
+
+ err = basl_open(in, 0);
+ if (!err) {
+ err = basl_open(out, 1);
+ if (err) {
+ basl_close(in);
+ }
+ }
+
+ return (err);
+}
+
+static void
+basl_end(struct basl_fio *in, struct basl_fio *out)
+{
+
+ basl_close(in);
+ basl_close(out);
+}
+
+static int
+basl_load(int fd, uint64_t off)
+{
+ struct stat sb;
+ int err;
+
+ err = 0;
+
+ if (fstat(fd, &sb) < 0 ||
+ read(fd, paddr_guest2host(basl_acpi_base + off), sb.st_size) < 0)
+ err = errno;
+
+ return (err);
+}
+
+static int
+basl_compile(int (*fwrite_section)(FILE *fp), uint64_t offset)
+{
+ struct basl_fio io[2];
+ static char iaslbuf[3*MAXPATHLEN + 10];
+ char *fmt;
+ int err;
+
+ err = basl_start(&io[0], &io[1]);
+ if (!err) {
+ err = (*fwrite_section)(io[0].fp);
+
+ if (!err) {
+ /*
+ * iasl sends the results of the compilation to
+ * stdout. Shut this down by using the shell to
+ * redirect stdout to /dev/null, unless the user
+ * has requested verbose output for debugging
+ * purposes
+ */
+ fmt = basl_verbose_iasl ?
+ "%s -p %s %s" :
+ "/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
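+
+			/*
+			 * The composed command is of the form (illustrative):
+			 *
+			 *   /bin/sh -c "iasl -p <output> <input>" 1> /dev/null
+			 */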
+
+ snprintf(iaslbuf, sizeof(iaslbuf),
+ fmt,
+ BHYVE_ASL_COMPILER,
+ io[1].f_name, io[0].f_name);
+ err = system(iaslbuf);
+
+ if (!err) {
+ /*
+ * Copy the aml output file into guest
+ * memory at the specified location
+ */
+ err = basl_load(io[1].fd, offset);
+ }
+ }
+ basl_end(&io[0], &io[1]);
+ }
+
+ return (err);
+}
+
+static int
+basl_make_templates(void)
+{
+ const char *tmpdir;
+ int err;
+ int len;
+
+ err = 0;
+
+	/*
+	 * Create the templates in the directory given by BHYVE_TMPDIR,
+	 * falling back to TMPDIR and finally to /tmp. The lookups are
+	 * done in two steps so that a set BHYVE_TMPDIR takes precedence.
+	 */
+	if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0')
+		tmpdir = getenv("TMPDIR");
+	if (tmpdir == NULL || *tmpdir == '\0')
+		tmpdir = _PATH_TMP;
+
+ len = strlen(tmpdir);
+
+ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
+ strcpy(basl_template, tmpdir);
+ while (len > 0 && basl_template[len - 1] == '/')
+ len--;
+ basl_template[len] = '/';
+ strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
+ } else
+ err = E2BIG;
+
+ if (!err) {
+ /*
+	 * len has been initialized (and maybe adjusted) above
+ */
+ if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
+ sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
+ strcpy(basl_stemplate, tmpdir);
+ basl_stemplate[len] = '/';
+ strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
+ len = strlen(basl_stemplate);
+ strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
+ } else
+ err = E2BIG;
+ }
+
+ return (err);
+}
+
+static struct {
+ int (*wsect)(FILE *fp);
+ uint64_t offset;
+} basl_ftables[] =
+{
+ { basl_fwrite_rsdp, 0},
+ { basl_fwrite_rsdt, RSDT_OFFSET },
+ { basl_fwrite_xsdt, XSDT_OFFSET },
+ { basl_fwrite_madt, MADT_OFFSET },
+ { basl_fwrite_fadt, FADT_OFFSET },
+ { basl_fwrite_facs, FACS_OFFSET },
+ { basl_fwrite_dsdt, DSDT_OFFSET },
+ { NULL }
+};
+
+int
+acpi_build(struct vmctx *ctx, int ncpu, int ioapic)
+{
+ int err;
+ int i;
+
+ err = 0;
+ basl_ncpu = ncpu;
+
+ if (!ioapic) {
+ fprintf(stderr, "ACPI tables require an ioapic\n");
+ return (EINVAL);
+ }
+
+ /*
+	 * For debugging, allow the user to have the iasl compiler output
+	 * sent to stdout rather than /dev/null
+ */
+ if (getenv("BHYVE_ACPI_VERBOSE_IASL"))
+ basl_verbose_iasl = 1;
+
+ /*
+ * Allow the user to keep the generated ASL files for debugging
+ * instead of deleting them following use
+ */
+ if (getenv("BHYVE_ACPI_KEEPTMPS"))
+ basl_keep_temps = 1;
+
+ i = 0;
+ err = basl_make_templates();
+
+ /*
+ * Run through all the ASL files, compiling them and
+ * copying them into guest memory
+ */
+ while (!err && basl_ftables[i].wsect != NULL) {
+ err = basl_compile(basl_ftables[i].wsect,
+ basl_ftables[i].offset);
+ i++;
+ }
+
+ return (err);
+}
diff --git a/usr.sbin/bhyve/acpi.h b/usr.sbin/bhyve/acpi.h
new file mode 100644
index 0000000..fec6c9d
--- /dev/null
+++ b/usr.sbin/bhyve/acpi.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ACPI_H_
+#define _ACPI_H_
+
+int acpi_build(struct vmctx *ctx, int ncpu, int ioapic);
+
+#endif /* _ACPI_H_ */
diff --git a/usr.sbin/bhyve/atpic.c b/usr.sbin/bhyve/atpic.c
new file mode 100644
index 0000000..a9fb084
--- /dev/null
+++ b/usr.sbin/bhyve/atpic.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "inout.h"
+
+/*
+ * FreeBSD only writes to the 8259 interrupt controllers to put them in a
+ * shutdown state.
+ *
+ * So, we just ignore the writes.
+ */
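+/*
+ * Illustrative guest sequence (not taken from this code): masking both
+ * PICs during shutdown looks like
+ *
+ *	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
+ *	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
+ *
+ * and is accepted, then discarded, by the handler below.
+ */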
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+#define ICU_IMR_OFFSET 1
+
+static int
+atpic_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (bytes != 1)
+ return (-1);
+
+ if (in)
+ return (-1);
+
+ /* Pretend all writes to the 8259 are alright */
+ return (0);
+}
+
+INOUT_PORT(atpic, IO_ICU1, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU1 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2, IOPORT_F_INOUT, atpic_handler);
+INOUT_PORT(atpic, IO_ICU2 + ICU_IMR_OFFSET, IOPORT_F_INOUT, atpic_handler);
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
new file mode 100644
index 0000000..999040f
--- /dev/null
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -0,0 +1,788 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <machine/segments.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "mem.h"
+#include "mevent.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "xmsr.h"
+#include "ioapic.h"
+#include "spinup_ap.h"
+
+#define DEFAULT_GUEST_HZ 100
+#define DEFAULT_GUEST_TSLICE 200
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define VMEXIT_SWITCH 0 /* force vcpu switch in mux mode */
+#define VMEXIT_CONTINUE 1 /* continue from next instruction */
+#define VMEXIT_RESTART 2 /* restart current instruction */
+#define VMEXIT_ABORT 3 /* abort the vm run loop */
+#define VMEXIT_RESET 4 /* guest machine has reset */
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+int guest_tslice = DEFAULT_GUEST_TSLICE;
+int guest_hz = DEFAULT_GUEST_HZ;
+char *vmname;
+
+u_long lomem_sz;
+u_long himem_sz;
+
+int guest_ncpus;
+
+static int pincpu = -1;
+static int guest_vcpu_mux;
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause, disable_x2apic;
+
+static int foundcpus;
+
+static int strictio;
+
+static int acpi;
+
+static char *lomem_addr;
+static char *himem_addr;
+
+static char *progname;
+static const int BSP = 0;
+
+static int cpumask;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+
+struct vm_exit vmexit[VM_MAXCPU];
+
+struct fbsdstats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t vmexit_paging;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+ int io_reset;
+} stats;
+
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static void
+usage(int code)
+{
+
+	fprintf(stderr,
+		"Usage: %s [-abehxABHIP][-g <gdb port>][-z <hz>][-s <pci>]"
+		"[-S <pci>][-p pincpu][-n <pci>][-m lowmem][-M highmem]"
+		"[-c <cpus>][-t <tslice>] <vm>\n"
+		"       -a: local apic is in XAPIC mode (default is X2APIC)\n"
+		"       -A: create an ACPI table\n"
+		"       -b: use bvm console\n"
+		"       -g: gdb port (default is %d and 0 means don't open)\n"
+		"       -c: # cpus (default 1)\n"
+		"       -p: pin vcpu 'n' to host cpu 'pincpu + n'\n"
+		"       -B: inject breakpoint exception on vm entry\n"
+		"       -H: vmexit from the guest on hlt\n"
+		"       -I: present an ioapic to the guest\n"
+		"       -P: vmexit from the guest on pause\n"
+		"       -e: exit on unhandled i/o access\n"
+		"       -h: help\n"
+		"       -z: guest hz (default is %d)\n"
+		"       -s: <slot,driver,configinfo> PCI slot config\n"
+		"       -S: <slot,driver,configinfo> legacy PCI slot config\n"
+		"       -m: lowmem in MB\n"
+		"       -M: highmem in MB\n"
+		"       -x: mux vcpus to 1 hcpu\n"
+		"       -t: mux vcpu timeslice hz (default %d)\n",
+		progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
+		DEFAULT_GUEST_TSLICE);
+ exit(code);
+}
+
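+/*
+ * Translate a guest physical address into a host virtual address. As an
+ * illustration: with 2GB of lowmem and 1GB of himem, gaddr 0x1000 lands
+ * in the lowmem segment, gaddr 4GB + 0x1000 lands in the himem segment,
+ * and addresses in the hole between them return NULL.
+ */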
+void *
+paddr_guest2host(uintptr_t gaddr)
+{
+ if (lomem_sz == 0)
+ return (NULL);
+
+ if (gaddr < lomem_sz) {
+ return ((void *)(lomem_addr + gaddr));
+ } else if (gaddr >= 4*GB && gaddr < (4*GB + himem_sz)) {
+ return ((void *)(himem_addr + gaddr - 4*GB));
+ } else
+ return (NULL);
+}
+
+int
+fbsdrun_disable_x2apic(void)
+{
+
+ return (disable_x2apic);
+}
+
+int
+fbsdrun_vmexit_on_pause(void)
+{
+
+ return (guest_vmexit_on_pause);
+}
+
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+
+ return (guest_vmexit_on_hlt);
+}
+
+int
+fbsdrun_muxed(void)
+{
+
+ return (guest_vcpu_mux);
+}
+
+static void *
+fbsdrun_start_thread(void *param)
+{
+ char tname[MAXCOMLEN + 1];
+ struct mt_vmm_info *mtp;
+ int vcpu;
+
+ mtp = param;
+ vcpu = mtp->mt_vcpu;
+
+ snprintf(tname, sizeof(tname), "%s vcpu %d", vmname, vcpu);
+ pthread_set_name_np(mtp->mt_thr, tname);
+
+ vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+ /* not reached */
+ exit(1);
+ return (NULL);
+}
+
+void
+fbsdrun_addcpu(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error;
+
+ if (cpumask & (1 << vcpu)) {
+ fprintf(stderr, "addcpu: attempting to add existing cpu %d\n",
+ vcpu);
+ exit(1);
+ }
+
+ cpumask |= 1 << vcpu;
+ foundcpus++;
+
+ /*
+ * Set up the vmexit struct to allow execution to start
+ * at the given RIP
+ */
+ vmexit[vcpu].rip = rip;
+ vmexit[vcpu].inst_length = 0;
+
+	if (vcpu == BSP || !guest_vcpu_mux) {
+ mt_vmm_info[vcpu].mt_ctx = ctx;
+ mt_vmm_info[vcpu].mt_vcpu = vcpu;
+
+ error = pthread_create(&mt_vmm_info[vcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[vcpu]);
+ assert(error == 0);
+ }
+}
+
+static int
+fbsdrun_get_next_cpu(int curcpu)
+{
+
+ /*
+ * Get the next available CPU. Assumes they arrive
+ * in ascending order with no gaps.
+ */
+ return ((curcpu + 1) % foundcpus);
+}
+
+static int
+vmexit_catch_reset(void)
+{
+ stats.io_reset++;
+ return (VMEXIT_RESET);
+}
+
+static int
+vmexit_catch_inout(void)
+{
+ return (VMEXIT_ABORT);
+}
+
+static int pause_noswitch;
+
+static int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+ uint32_t eax)
+{
+#if PG_DEBUG /* put all types of debug here */
+ if (eax == 0) {
+ pause_noswitch = 1;
+ } else if (eax == 1) {
+ pause_noswitch = 0;
+ } else {
+ pause_noswitch = 0;
+ if (eax == 5) {
+ vm_set_capability(ctx, *pvcpu, VM_CAP_MTRAP_EXIT, 1);
+ }
+ }
+#endif
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+ int bytes, port, in, out;
+ uint32_t eax;
+ int vcpu;
+
+ vcpu = *pvcpu;
+
+ port = vme->u.inout.port;
+ bytes = vme->u.inout.bytes;
+ eax = vme->u.inout.eax;
+ in = vme->u.inout.in;
+ out = !in;
+
+ /* We don't deal with these */
+ if (vme->u.inout.string || vme->u.inout.rep)
+ return (VMEXIT_ABORT);
+
+ /* Special case of guest reset */
+ if (out && port == 0x64 && (uint8_t)eax == 0xFE)
+ return (vmexit_catch_reset());
+
+ /* Extra-special case of host notifications */
+ if (out && port == GUEST_NIO_PORT)
+ return (vmexit_handle_notify(ctx, vme, pvcpu, eax));
+
+ error = emulate_inout(ctx, vcpu, in, port, bytes, &eax, strictio);
+ if (error == 0 && in)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, eax);
+
+ if (error == 0)
+ return (VMEXIT_CONTINUE);
+ else {
+ fprintf(stderr, "Unhandled %s%c 0x%04x\n",
+ in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port);
+ return (vmexit_catch_inout());
+ }
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ fprintf(stderr, "vm exit rdmsr 0x%x, cpu %d\n", vme->u.msr.code,
+ *pvcpu);
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+	newcpu = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+ newcpu = spinup_ap(ctx, *pvcpu,
+ vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
+
+ if (guest_vcpu_mux && *pvcpu != newcpu) {
+ retval = VMEXIT_SWITCH;
+ *pvcpu = newcpu;
+ }
+
+ return (retval);
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+ fprintf(stderr, "\treason\t\tVMX\n");
+ fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
+ fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+ fprintf(stderr, "\terror\t\t%d\n", vmexit->u.vmx.error);
+ fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
+ fprintf(stderr, "\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+
+ return (VMEXIT_ABORT);
+}
+
+static int bogus_noswitch = 1;
+
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_bogus++;
+
+ if (!guest_vcpu_mux || guest_ncpus == 1 || bogus_noswitch) {
+ return (VMEXIT_RESTART);
+ } else {
+ stats.vmexit_bogus_switch++;
+ vmexit->inst_length = 0;
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ }
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_hlt++;
+ if (fbsdrun_muxed()) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ /*
+ * Just continue execution with the next instruction. We use
+ * the HLT VM exit as a way to be friendly with the host
+ * scheduler.
+ */
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_pause++;
+
+ if (fbsdrun_muxed() && !pause_noswitch) {
+ *pvcpu = -1;
+ return (VMEXIT_SWITCH);
+ } else {
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ stats.vmexit_mtrap++;
+
+ return (VMEXIT_RESTART);
+}
+
+static int
+vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ int err;
+
+	stats.vmexit_paging++;
+
+ err = emulate_mem(ctx, *pvcpu, vmexit->u.paging.gpa,
+ &vmexit->u.paging.vie);
+
+ if (err) {
+ if (err == EINVAL) {
+ fprintf(stderr,
+ "Failed to emulate instruction at 0x%lx\n",
+ vmexit->rip);
+ } else if (err == ESRCH) {
+ fprintf(stderr, "Unhandled memory access to 0x%lx\n",
+ vmexit->u.paging.gpa);
+ }
+
+ return (VMEXIT_ABORT);
+ }
+
+ return (VMEXIT_CONTINUE);
+}
+
+static void
+sigalrm(int sig)
+{
+ return;
+}
+
+static void
+setup_timeslice(void)
+{
+ struct sigaction sa;
+ struct itimerval itv;
+ int error;
+
+ /*
+ * Setup a realtime timer to generate a SIGALRM at a
+ * frequency of 'guest_tslice' ticks per second.
+ */
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ sa.sa_handler = sigalrm;
+
+ error = sigaction(SIGALRM, &sa, NULL);
+ assert(error == 0);
+
+ itv.it_interval.tv_sec = 0;
+ itv.it_interval.tv_usec = 1000000 / guest_tslice;
+ itv.it_value.tv_sec = 0;
+ itv.it_value.tv_usec = 1000000 / guest_tslice;
+
+ error = setitimer(ITIMER_REAL, &itv, NULL);
+ assert(error == 0);
+}
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_PAGING] = vmexit_paging,
+ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
+};
+
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip)
+{
+ int error, rc, prevcpu;
+
+ if (guest_vcpu_mux)
+ setup_timeslice();
+
+ if (pincpu >= 0) {
+ error = vm_set_pinning(ctx, vcpu, pincpu + vcpu);
+ assert(error == 0);
+ }
+
+ while (1) {
+ error = vm_run(ctx, vcpu, rip, &vmexit[vcpu]);
+ if (error != 0) {
+ /*
+ * It is possible that 'vmmctl' or some other process
+ * has transitioned the vcpu to CANNOT_RUN state right
+ * before we tried to transition it to RUNNING.
+ *
+ * This is expected to be temporary so just retry.
+ */
+ if (errno == EBUSY)
+ continue;
+ else
+ break;
+ }
+
+ prevcpu = vcpu;
+ rc = (*handler[vmexit[vcpu].exitcode])(ctx, &vmexit[vcpu],
+ &vcpu);
+ switch (rc) {
+ case VMEXIT_SWITCH:
+ assert(guest_vcpu_mux);
+ if (vcpu == -1) {
+ stats.cpu_switch_rotate++;
+ vcpu = fbsdrun_get_next_cpu(prevcpu);
+ } else {
+ stats.cpu_switch_direct++;
+ }
+ /* fall through */
+ case VMEXIT_CONTINUE:
+ rip = vmexit[vcpu].rip + vmexit[vcpu].inst_length;
+ break;
+ case VMEXIT_RESTART:
+ rip = vmexit[vcpu].rip;
+ break;
+ case VMEXIT_RESET:
+ exit(0);
+ default:
+ exit(1);
+ }
+ }
+ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+static int
+num_vcpus_allowed(struct vmctx *ctx)
+{
+ int tmp, error;
+
+ error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
+
+ /*
+ * The guest is allowed to spinup more than one processor only if the
+ * UNRESTRICTED_GUEST capability is available.
+ */
+ if (error == 0)
+ return (VM_MAXCPU);
+ else
+ return (1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int c, error, gdb_port, inject_bkpt, tmp, err, ioapic, bvmcons;
+ int max_vcpus;
+ struct vmctx *ctx;
+ uint64_t rip;
+
+ bvmcons = 0;
+ inject_bkpt = 0;
+ progname = basename(argv[0]);
+ gdb_port = DEFAULT_GDB_PORT;
+ guest_ncpus = 1;
+ ioapic = 0;
+
+	while ((c = getopt(argc, argv, "abehABHIPxp:g:c:z:s:S:t:n:m:M:")) != -1) {
+ switch (c) {
+ case 'a':
+ disable_x2apic = 1;
+ break;
+ case 'A':
+ acpi = 1;
+ break;
+ case 'b':
+ bvmcons = 1;
+ break;
+ case 'B':
+ inject_bkpt = 1;
+ break;
+ case 'x':
+ guest_vcpu_mux = 1;
+ break;
+ case 'p':
+ pincpu = atoi(optarg);
+ break;
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'g':
+ gdb_port = atoi(optarg);
+ break;
+ case 'z':
+ guest_hz = atoi(optarg);
+ break;
+ case 't':
+ guest_tslice = atoi(optarg);
+ break;
+ case 's':
+ pci_parse_slot(optarg, 0);
+ break;
+ case 'S':
+ pci_parse_slot(optarg, 1);
+ break;
+ case 'm':
+ lomem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'M':
+ himem_sz = strtoul(optarg, NULL, 0) * MB;
+ break;
+ case 'H':
+ guest_vmexit_on_hlt = 1;
+ break;
+ case 'I':
+ ioapic = 1;
+ break;
+ case 'P':
+ guest_vmexit_on_pause = 1;
+ break;
+ case 'e':
+ strictio = 1;
+ break;
+ case 'h':
+ usage(0);
+ default:
+ usage(1);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage(1);
+
+ /* No need to mux if guest is uni-processor */
+ if (guest_ncpus <= 1)
+ guest_vcpu_mux = 0;
+
+	/* vmexit on hlt or pause if guest is muxed */
+ if (guest_vcpu_mux) {
+ guest_vmexit_on_hlt = 1;
+ guest_vmexit_on_pause = 1;
+ }
+
+ vmname = argv[0];
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ max_vcpus = num_vcpus_allowed(ctx);
+ if (guest_ncpus > max_vcpus) {
+ fprintf(stderr, "%d vCPUs requested but only %d available\n",
+ guest_ncpus, max_vcpus);
+ exit(1);
+ }
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ err = vm_get_capability(ctx, BSP, VM_CAP_HALT_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr, "VM exit on HLT not supported\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_HALT_EXIT, 1);
+ handler[VM_EXITCODE_HLT] = vmexit_hlt;
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ /*
+ * pause exit support required for this mode
+ */
+ err = vm_get_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr,
+ "SMP mux requested, no pause support\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, BSP, VM_CAP_PAUSE_EXIT, 1);
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+ }
+
+ if (fbsdrun_disable_x2apic())
+ err = vm_set_x2apic_state(ctx, BSP, X2APIC_DISABLED);
+ else
+ err = vm_set_x2apic_state(ctx, BSP, X2APIC_ENABLED);
+
+ if (err) {
+ fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
+ exit(1);
+ }
+
+ if (lomem_sz != 0) {
+ lomem_addr = vm_map_memory(ctx, 0, lomem_sz);
+ if (lomem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ } else if (himem_sz != 0) {
+ himem_addr = vm_map_memory(ctx, 4*GB, himem_sz);
+ if (himem_addr == (char *) MAP_FAILED) {
+ lomem_sz = 0;
+ himem_sz = 0;
+ }
+ }
+ }
+
+ init_inout();
+ init_pci(ctx);
+ if (ioapic)
+ ioapic_init(0);
+
+ if (gdb_port != 0)
+ init_dbgport(gdb_port);
+
+ if (bvmcons)
+ init_bvmcons();
+
+ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ if (inject_bkpt) {
+ error = vm_inject_event(ctx, BSP, VM_HW_EXCEPTION, IDT_BP);
+ assert(error == 0);
+ }
+
+ /*
+ * build the guest tables, MP etc.
+ */
+ mptable_build(ctx, guest_ncpus, ioapic);
+
+ if (acpi) {
+ error = acpi_build(ctx, guest_ncpus, ioapic);
+ assert(error == 0);
+ }
+
+ /*
+ * Add CPU 0
+ */
+ fbsdrun_addcpu(ctx, BSP, rip);
+
+ /*
+ * Head off to the main event dispatch loop
+ */
+ mevent_dispatch();
+
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/bhyverun.h b/usr.sbin/bhyve/bhyverun.h
new file mode 100644
index 0000000..45033b8
--- /dev/null
+++ b/usr.sbin/bhyve/bhyverun.h
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+#ifndef CTASSERT /* Allow lint to override */
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
+struct vmctx;
+extern int guest_hz;
+extern int guest_tslice;
+extern int guest_ncpus;
+extern char *vmname;
+
+extern u_long lomem_sz, himem_sz;
+
+void *paddr_guest2host(uintptr_t);
+
+void fbsdrun_addcpu(struct vmctx *ctx, int cpu, uint64_t rip);
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+int fbsdrun_disable_x2apic(void);
+#endif
diff --git a/usr.sbin/bhyve/consport.c b/usr.sbin/bhyve/consport.c
new file mode 100644
index 0000000..3915b6d
--- /dev/null
+++ b/usr.sbin/bhyve/consport.c
@@ -0,0 +1,140 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "inout.h"
+
+#define BVM_CONSOLE_PORT 0x220
+#define BVM_CONS_SIG ('b' << 8 | 'v')
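+
+/*
+ * A guest can probe for the console with a 2-byte read of the port,
+ * comparing the result against the signature, e.g. (illustrative):
+ *
+ *	if (inw(BVM_CONSOLE_PORT) == BVM_CONS_SIG)
+ *		bvm_console_present = 1;
+ */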
+
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+	tio_new = tio_orig;
+	cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1);
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_CONS_SIG;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+ if (!opened) {
+ ttyopen();
+ opened = 1;
+ }
+
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+
+static struct inout_port consport = {
+ "bvmcons",
+ BVM_CONSOLE_PORT,
+ IOPORT_F_INOUT,
+ console_handler
+};
+
+void
+init_bvmcons(void)
+{
+
+ register_inout(&consport);
+}
diff --git a/usr.sbin/bhyve/dbgport.c b/usr.sbin/bhyve/dbgport.c
new file mode 100644
index 0000000..034531c
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.c
@@ -0,0 +1,138 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "inout.h"
+#include "dbgport.h"
+
+#define BVM_DBG_PORT 0x224
+#define BVM_DBG_SIG ('B' << 8 | 'V')
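+
+/*
+ * As with the bvm console, a guest can probe for the debug port with a
+ * 2-byte read and compare the result against BVM_DBG_SIG.
+ */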
+
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ char ch;
+ int nwritten, nread, printonce;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_DBG_SIG;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+again:
+ printonce = 0;
+ while (conn_fd < 0) {
+ if (!printonce) {
+ printf("Waiting for connection from gdb\r\n");
+ printonce = 1;
+ }
+ conn_fd = accept(listen_fd, NULL, NULL);
+ if (conn_fd >= 0)
+ fcntl(conn_fd, F_SETFL, O_NONBLOCK);
+ else if (errno != EINTR)
+ perror("accept");
+ }
+
+ if (in) {
+ nread = read(conn_fd, &ch, 1);
+ if (nread == -1 && errno == EAGAIN)
+ *eax = -1;
+ else if (nread == 1)
+ *eax = ch;
+ else {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ } else {
+ ch = *eax;
+ nwritten = write(conn_fd, &ch, 1);
+ if (nwritten != 1) {
+ close(conn_fd);
+ conn_fd = -1;
+ goto again;
+ }
+ }
+ return (0);
+}
+
+static struct inout_port dbgport = {
+ "bvmdbg",
+ BVM_DBG_PORT,
+ IOPORT_F_INOUT,
+ dbg_handler
+};
+
+void
+init_dbgport(int sport)
+{
+ conn_fd = -1;
+
+ if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(sport);
+
+ if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(listen_fd, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ register_inout(&dbgport);
+}
diff --git a/usr.sbin/bhyve/dbgport.h b/usr.sbin/bhyve/dbgport.h
new file mode 100644
index 0000000..8c7dab7
--- /dev/null
+++ b/usr.sbin/bhyve/dbgport.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+#define DEFAULT_GDB_PORT 6466
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr.sbin/bhyve/elcr.c b/usr.sbin/bhyve/elcr.c
new file mode 100644
index 0000000..2417ae1
--- /dev/null
+++ b/usr.sbin/bhyve/elcr.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "inout.h"
+
+/*
+ * EISA interrupt Level Control Register.
+ *
+ * This is a 16-bit register with one bit for each of IRQ0 through IRQ15.
+ * A level-triggered IRQ is indicated by setting the corresponding bit to '1'.
+ */
+#define ELCR_PORT 0x4d0
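+
+/*
+ * Example (illustrative): to mark IRQ10 as level triggered, a guest would
+ * set bit 2 in the second byte of the register pair:
+ *
+ *	outb(ELCR_PORT + 1, inb(ELCR_PORT + 1) | (1 << (10 - 8)));
+ */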
+
+static uint8_t elcr[2] = { 0x00, 0x00 };
+
+static int
+elcr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int idx;
+
+ if (bytes != 1)
+ return (-1);
+
+ idx = port - ELCR_PORT;
+
+ if (in)
+ *eax = elcr[idx];
+ else
+ elcr[idx] = *eax;
+
+ return (0);
+}
+INOUT_PORT(elcr, ELCR_PORT + 0, IOPORT_F_INOUT, elcr_handler);
+INOUT_PORT(elcr, ELCR_PORT + 1, IOPORT_F_INOUT, elcr_handler);
diff --git a/usr.sbin/bhyve/inout.c b/usr.sbin/bhyve/inout.c
new file mode 100644
index 0000000..5f47a89f
--- /dev/null
+++ b/usr.sbin/bhyve/inout.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+static struct {
+ const char *name;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+} inout_handlers[MAX_IOPORTS];
+
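+/*
+ * Default handler: reads of an unclaimed port return all 1s, matching a
+ * floating ISA bus, and writes are silently discarded.
+ */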
+static int
+default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (in) {
+ switch (bytes) {
+ case 4:
+ *eax = 0xffffffff;
+ break;
+ case 2:
+ *eax = 0xffff;
+ break;
+ case 1:
+ *eax = 0xff;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+int
+emulate_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, int strict)
+{
+ int flags;
+ uint32_t mask;
+ inout_func_t handler;
+ void *arg;
+
+ assert(port < MAX_IOPORTS);
+
+ handler = inout_handlers[port].handler;
+
+ if (strict && handler == default_inout)
+ return (-1);
+
+ if (!in) {
+ switch (bytes) {
+ case 1:
+ mask = 0xff;
+ break;
+ case 2:
+ mask = 0xffff;
+ break;
+ default:
+ mask = 0xffffffff;
+ break;
+ }
+ *eax = *eax & mask;
+ }
+
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
+
+ if ((in && (flags & IOPORT_F_IN)) || (!in && (flags & IOPORT_F_OUT)))
+ return ((*handler)(ctx, vcpu, in, port, bytes, eax, arg));
+ else
+ return (-1);
+}
+
+void
+init_inout(void)
+{
+ struct inout_port **iopp, *iop;
+ int i;
+
+ /*
+ * Set up the default handler for all ports
+ */
+ for (i = 0; i < MAX_IOPORTS; i++) {
+ inout_handlers[i].name = "default";
+ inout_handlers[i].flags = IOPORT_F_IN | IOPORT_F_OUT;
+ inout_handlers[i].handler = default_inout;
+ inout_handlers[i].arg = NULL;
+ }
+
+ /*
+ * Overwrite with specified handlers
+ */
+ SET_FOREACH(iopp, inout_port_set) {
+ iop = *iopp;
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+		inout_handlers[iop->port].arg = iop->arg;
+ }
+}
+
+int
+register_inout(struct inout_port *iop)
+{
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+ inout_handlers[iop->port].arg = iop->arg;
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/inout.h b/usr.sbin/bhyve/inout.h
new file mode 100644
index 0000000..a73b78d
--- /dev/null
+++ b/usr.sbin/bhyve/inout.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+struct inout_port {
+ const char *name;
+ int port;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT 0x3
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ (flags), \
+ (handler), \
+ 0 \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
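+
+/*
+ * Example usage (hypothetical device): a file-scope registration such as
+ *
+ *	INOUT_PORT(mydev, 0x300, IOPORT_F_INOUT, mydev_handler);
+ *
+ * adds the entry to the linker set that init_inout() walks at startup.
+ */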
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, int strict);
+int register_inout(struct inout_port *iop);
+
+void init_bvmcons(void);
+
+#endif /* _INOUT_H_ */
diff --git a/usr.sbin/bhyve/ioapic.c b/usr.sbin/bhyve/ioapic.c
new file mode 100644
index 0000000..c712692
--- /dev/null
+++ b/usr.sbin/bhyve/ioapic.c
@@ -0,0 +1,324 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <x86/apicreg.h>
+#include <machine/vmm.h>
+
+#include <string.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <vmmapi.h>
+
+#include "inout.h"
+#include "mem.h"
+#include "bhyverun.h"
+
+#include <stdio.h>
+
+#define IOAPIC_PADDR 0xFEC00000
+
+#define IOREGSEL 0x00
+#define IOWIN 0x10
+
+#define REDIR_ENTRIES 16
+#define INTR_ASSERTED(ioapic, pin) ((ioapic)->pinstate[(pin)] == true)
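+
+/*
+ * Each 64-bit redirection table entry is exposed to the guest as two
+ * 32-bit registers: per the IOAPIC specification, pin 'n' occupies
+ * register indices IOAPIC_REDTBL + 2n (low word) and IOAPIC_REDTBL +
+ * 2n + 1 (high word).
+ */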
+
+struct ioapic {
+ int inited;
+ uint32_t id;
+ uint64_t redtbl[REDIR_ENTRIES];
+ bool pinstate[REDIR_ENTRIES];
+
+ uintptr_t paddr; /* gpa where the ioapic is mapped */
+ uint32_t ioregsel;
+ struct memory_region *region;
+};
+
+static struct ioapic ioapics[1]; /* only a single ioapic for now */
+
+static int ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr,
+ int size, uint64_t *data);
+static int ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr,
+ int size, uint64_t data);
+static int ioapic_region_handler(struct vmctx *vm, int vcpu, int dir,
+ uintptr_t paddr, int size, uint64_t *val,
+ void *arg1, long arg2);
+
+static void
+ioapic_set_pinstate(struct vmctx *ctx, int pin, bool newstate)
+{
+ int vector, apicid, vcpu;
+ uint32_t low, high;
+ struct ioapic *ioapic;
+
+ ioapic = &ioapics[0]; /* assume a single ioapic */
+
+ if (pin < 0 || pin >= REDIR_ENTRIES)
+ return;
+
+ /* Nothing to do if interrupt pin has not changed state */
+ if (ioapic->pinstate[pin] == newstate)
+ return;
+
+ ioapic->pinstate[pin] = newstate; /* record it */
+
+ /* Nothing to do if interrupt pin is deasserted */
+ if (!INTR_ASSERTED(ioapic, pin))
+ return;
+
+ /*
+ * XXX
+ * We only deal with:
+ * - edge triggered interrupts
+ * - physical destination mode
+ * - fixed delivery mode
+ */
+ low = ioapic->redtbl[pin];
+ high = ioapic->redtbl[pin] >> 32;
+ if ((low & IOART_INTMASK) == IOART_INTMCLR &&
+ (low & IOART_TRGRMOD) == IOART_TRGREDG &&
+ (low & IOART_DESTMOD) == IOART_DESTPHY &&
+ (low & IOART_DELMOD) == IOART_DELFIXED) {
+ vector = low & IOART_INTVEC;
+ apicid = high >> APIC_ID_SHIFT;
+ if (apicid != 0xff) {
+ /* unicast */
+ vcpu = vm_apicid2vcpu(ctx, apicid);
+ vm_lapic_irq(ctx, vcpu, vector);
+ } else {
+ /* broadcast */
+ vcpu = 0;
+ while (vcpu < guest_ncpus) {
+ vm_lapic_irq(ctx, vcpu, vector);
+ vcpu++;
+ }
+ }
+ }
+}
+
+void
+ioapic_deassert_pin(struct vmctx *ctx, int pin)
+{
+ ioapic_set_pinstate(ctx, pin, false);
+}
+
+void
+ioapic_assert_pin(struct vmctx *ctx, int pin)
+{
+ ioapic_set_pinstate(ctx, pin, true);
+}
+
+void
+ioapic_init(int which)
+{
+ struct mem_range memp;
+ struct ioapic *ioapic;
+ int error;
+ int i;
+
+ assert(which == 0);
+
+ ioapic = &ioapics[which];
+ assert(ioapic->inited == 0);
+
+ bzero(ioapic, sizeof(struct ioapic));
+
+ /* Initialize all redirection entries to mask all interrupts */
+ for (i = 0; i < REDIR_ENTRIES; i++)
+ ioapic->redtbl[i] = 0x0001000000010000UL;
+
+ ioapic->paddr = IOAPIC_PADDR;
+
+ /* Register emulated memory region */
+ memp.name = "ioapic";
+ memp.flags = MEM_F_RW;
+ memp.handler = ioapic_region_handler;
+ memp.arg1 = ioapic;
+ memp.arg2 = which;
+ memp.base = ioapic->paddr;
+ memp.size = sizeof(struct IOAPIC);
+ error = register_mem(&memp);
+
+	assert(error == 0);
+
+ ioapic->inited = 1;
+}
+
+static uint32_t
+ioapic_read(struct ioapic *ioapic, uint32_t addr)
+{
+ int regnum, pin, rshift;
+
+ assert(ioapic->inited);
+
+ regnum = addr & 0xff;
+	switch (regnum) {
+	case IOAPIC_ID:
+		return (ioapic->id);
+	case IOAPIC_VER:
+		return ((REDIR_ENTRIES << MAXREDIRSHIFT) | 0x11);
+	case IOAPIC_ARB:
+		return (ioapic->id);
+	default:
+		break;
+	}
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ rshift = 32;
+ else
+ rshift = 0;
+
+ return (ioapic->redtbl[pin] >> rshift);
+ }
+
+ return (0);
+}
+
+static void
+ioapic_write(struct ioapic *ioapic, uint32_t addr, uint32_t data)
+{
+ int regnum, pin, lshift;
+
+ assert(ioapic->inited);
+
+ regnum = addr & 0xff;
+ switch (regnum) {
+ case IOAPIC_ID:
+ ioapic->id = data & APIC_ID_MASK;
+ break;
+ case IOAPIC_VER:
+ case IOAPIC_ARB:
+ /* readonly */
+ break;
+ default:
+ break;
+ }
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ lshift = 32;
+ else
+ lshift = 0;
+
+ ioapic->redtbl[pin] &= ~((uint64_t)0xffffffff << lshift);
+ ioapic->redtbl[pin] |= ((uint64_t)data << lshift);
+ }
+}
+
+static int
+ioapic_region_read(struct ioapic *ioapic, uintptr_t paddr, int size,
+ uint64_t *data)
+{
+ int offset;
+
+ offset = paddr - ioapic->paddr;
+
+ /*
+ * The IOAPIC specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+#if 1
+ printf("invalid access to ioapic%d: size %d, offset %d\n",
+ (int)(ioapic - ioapics), size, offset);
+#endif
+ *data = 0;
+ return (0);
+ }
+
+ if (offset == IOREGSEL)
+ *data = ioapic->ioregsel;
+ else
+ *data = ioapic_read(ioapic, ioapic->ioregsel);
+
+ return (0);
+}
+
+static int
+ioapic_region_write(struct ioapic *ioapic, uintptr_t paddr, int size,
+ uint64_t data)
+{
+ int offset;
+
+ offset = paddr - ioapic->paddr;
+
+ /*
+	 * The IOAPIC specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+#if 1
+ printf("invalid access to ioapic%d: size %d, offset %d\n",
+ (int)(ioapic - ioapics), size, offset);
+#endif
+ return (0);
+ }
+
+ if (offset == IOREGSEL)
+ ioapic->ioregsel = data;
+ else
+ ioapic_write(ioapic, ioapic->ioregsel, data);
+
+ return (0);
+}
+
+static int
+ioapic_region_handler(struct vmctx *vm, int vcpu, int dir, uintptr_t paddr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct ioapic *ioapic;
+ int which;
+
+ ioapic = arg1;
+ which = arg2;
+
+ assert(ioapic == &ioapics[which]);
+
+ if (dir == MEM_F_READ)
+ ioapic_region_read(ioapic, paddr, size, val);
+ else
+ ioapic_region_write(ioapic, paddr, size, *val);
+
+ return (0);
+}
diff --git a/usr.sbin/bhyve/ioapic.h b/usr.sbin/bhyve/ioapic.h
new file mode 100644
index 0000000..4696f9a
--- /dev/null
+++ b/usr.sbin/bhyve/ioapic.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IOAPIC_H_
+#define _IOAPIC_H_
+
+struct vmctx;
+
+void ioapic_init(int num);
+void ioapic_deassert_pin(struct vmctx *ctx, int pin);
+void ioapic_assert_pin(struct vmctx *ctx, int pin);
+
+#endif
diff --git a/usr.sbin/bhyve/mem.c b/usr.sbin/bhyve/mem.c
new file mode 100644
index 0000000..27f4782
--- /dev/null
+++ b/usr.sbin/bhyve/mem.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Memory ranges are represented with an RB tree. On insertion, the range
+ * is checked for overlaps. On lookup, the key has the same base and limit
+ * so that any range it falls within will compare as equal.
+ *
+ * It is assumed that all setup of ranges takes place in single-threaded
+ * mode before vCPUs have been started. As such, no locks are used on the
+ * RB tree. If this is no longer the case, then a r/w lock could be used,
+ * with readers taking the lock around lookups and a writer taking it when
+ * the tree needs to be changed (and the per-vCPU caches flushed).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/tree.h>
+#include <sys/errno.h>
+#include <machine/vmm.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "mem.h"
+
+struct mmio_rb_range {
+ RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
+ struct mem_range mr_param;
+ uint64_t mr_base;
+ uint64_t mr_end;
+};
+
+struct mmio_rb_tree;
+RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rbroot;
+
+/*
+ * Per-vCPU cache. Since most accesses from a vCPU will be to
+ * consecutive addresses in a range, it makes sense to cache the
+ * result of a lookup.
+ */
+static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
+
+static int
+mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
+{
+ if (a->mr_end < b->mr_base)
+ return (-1);
+ else if (a->mr_base > b->mr_end)
+ return (1);
+ return (0);
+}
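+
+/*
+ * Example (illustrative): a lookup key with mr_base == mr_end == 0xfec00010
+ * compares equal to a stored range [0xfec00000, 0xfec00fff], since the
+ * comparator above returns 0 whenever the two ranges overlap.
+ */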
+
+static int
+mmio_rb_lookup(uint64_t addr, struct mmio_rb_range **entry)
+{
+ struct mmio_rb_range find, *res;
+
+ find.mr_base = find.mr_end = addr;
+
+ res = RB_FIND(mmio_rb_tree, &mmio_rbroot, &find);
+
+ if (res != NULL) {
+ *entry = res;
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static int
+mmio_rb_add(struct mmio_rb_range *new)
+{
+ struct mmio_rb_range *overlap;
+
+ overlap = RB_INSERT(mmio_rb_tree, &mmio_rbroot, new);
+
+ if (overlap != NULL) {
+#ifdef RB_DEBUG
+ printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
+ new->mr_base, new->mr_end,
+ overlap->mr_base, overlap->mr_end);
+#endif
+
+ return (EEXIST);
+ }
+
+ return (0);
+}
+
+#if 0
+static void
+mmio_rb_dump(void)
+{
+ struct mmio_rb_range *np;
+
+ RB_FOREACH(np, mmio_rb_tree, &mmio_rbroot) {
+ printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
+ np->mr_param.name);
+ }
+}
+#endif
+
+RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+ rval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+ &wval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+int
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie)
+{
+ struct mmio_rb_range *entry;
+ int err;
+
+ /*
+ * First check the per-vCPU cache
+ */
+ if (mmio_hint[vcpu] &&
+ paddr >= mmio_hint[vcpu]->mr_base &&
+ paddr <= mmio_hint[vcpu]->mr_end) {
+ entry = mmio_hint[vcpu];
+ } else
+ entry = NULL;
+
+ if (entry == NULL) {
+ if (mmio_rb_lookup(paddr, &entry))
+ return (ESRCH);
+
+ /* Update the per-vCPU cache */
+ mmio_hint[vcpu] = entry;
+ }
+
+ assert(entry != NULL && entry == mmio_hint[vcpu]);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie,
+ mem_read, mem_write, &entry->mr_param);
+ return (err);
+}
+
+int
+register_mem(struct mem_range *memp)
+{
+ struct mmio_rb_range *mrp;
+ int err;
+
+ err = 0;
+
+ mrp = malloc(sizeof(struct mmio_rb_range));
+
+ if (mrp != NULL) {
+ mrp->mr_param = *memp;
+ mrp->mr_base = memp->base;
+ mrp->mr_end = memp->base + memp->size - 1;
+
+ err = mmio_rb_add(mrp);
+ if (err)
+ free(mrp);
+ } else
+ err = ENOMEM;
+
+ return (err);
+}
+
+void
+init_mem(void)
+{
+
+ RB_INIT(&mmio_rbroot);
+}
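+
+#if 0
+/*
+ * Usage sketch, illustrative only (the handler and range below are
+ * hypothetical and not part of the build): a device emulation fills in
+ * a struct mem_range and registers it; emulate_mem() then dispatches
+ * guest MMIO accesses in that range to the handler.
+ */
+static int
+null_mmio_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+		  int size, uint64_t *val, void *arg1, long arg2)
+{
+	if (dir == MEM_F_READ)
+		*val = 0;	/* reads return 0; writes are discarded */
+	return (0);
+}
+
+static void
+null_mmio_register(void)
+{
+	struct mem_range mr = {
+		.name	 = "null-mmio",
+		.flags	 = MEM_F_RW,
+		.handler = null_mmio_handler,
+		.base	 = 0xD0000000,	/* hypothetical guest-physical base */
+		.size	 = 0x1000,
+	};
+
+	(void) register_mem(&mr);	/* returns EEXIST on overlap */
+}
+#endif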
diff --git a/usr.sbin/bhyve/mem.h b/usr.sbin/bhyve/mem.h
new file mode 100644
index 0000000..88fafe1
--- /dev/null
+++ b/usr.sbin/bhyve/mem.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEM_H_
+#define _MEM_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2);
+
+struct mem_range {
+ const char *name;
+ int flags;
+ mem_func_t handler;
+ void *arg1;
+ long arg2;
+ uint64_t base;
+ uint64_t size;
+};
+#define MEM_F_READ 0x1
+#define MEM_F_WRITE 0x2
+#define MEM_F_RW 0x3
+
+void init_mem(void);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie);
+
+int register_mem(struct mem_range *memp);
+
+#endif /* _MEM_H_ */
diff --git a/usr.sbin/bhyve/mevent.c b/usr.sbin/bhyve/mevent.c
new file mode 100644
index 0000000..a6109db
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.c
@@ -0,0 +1,432 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Micro event library for FreeBSD, designed for a single i/o thread
+ * using kqueue, with events persistent by default.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "mevent.h"
+
+#define MEVENT_MAX 64
+
+#define MEV_ENABLE 1
+#define MEV_DISABLE 2
+#define MEV_DEL_PENDING 3
+
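+/*
+ * Event life cycle (summary of the code below): mevent_add() creates an
+ * entry in MEV_ENABLE state on the change list; mevent_enable() and
+ * mevent_disable() toggle it between MEV_ENABLE and MEV_DISABLE;
+ * mevent_delete*() marks it MEV_DEL_PENDING, and mevent_build() frees
+ * it on its next pass over the change list.
+ */
+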
+extern char *vmname;
+
+static pthread_t mevent_tid;
+static int mevent_pipefd[2];
+static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
+
+struct mevent {
+ void (*me_func)(int, enum ev_type, void *);
+ int me_fd;
+ enum ev_type me_type;
+ void *me_param;
+ int me_cq;
+ int me_state;
+ int me_closefd;
+ LIST_ENTRY(mevent) me_list;
+};
+
+static LIST_HEAD(listhead, mevent) global_head, change_head;
+
+static void
+mevent_qlock(void)
+{
+ pthread_mutex_lock(&mevent_lmutex);
+}
+
+static void
+mevent_qunlock(void)
+{
+ pthread_mutex_unlock(&mevent_lmutex);
+}
+
+static void
+mevent_pipe_read(int fd, enum ev_type type, void *param)
+{
+ char buf[MEVENT_MAX];
+ int status;
+
+ /*
+ * Drain the pipe read side. The fd is non-blocking so this is
+ * safe to do.
+ */
+ do {
+ status = read(fd, buf, sizeof(buf));
+ } while (status == MEVENT_MAX);
+}
+
+static void
+mevent_notify(void)
+{
+ char c;
+
+ /*
+ * If calling from outside the i/o thread, write a byte on the
+ * pipe to force the i/o thread to exit the blocking kevent call.
+ */
+ if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+ write(mevent_pipefd[1], &c, 1);
+ }
+}
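+
+/*
+ * This is the classic self-pipe pattern: the byte is only a wakeup
+ * token and is drained and discarded by mevent_pipe_read() once the
+ * i/o thread returns from its blocking kevent() call.
+ */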
+
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+ int retval;
+
+ retval = 0;
+
+ if (mevp->me_type == EVF_READ)
+ retval = EVFILT_READ;
+
+ if (mevp->me_type == EVF_WRITE)
+ retval = EVFILT_WRITE;
+
+ return (retval);
+}
+
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+ int ret;
+
+ switch (mevp->me_state) {
+ case MEV_ENABLE:
+ ret = EV_ADD;
+ break;
+ case MEV_DISABLE:
+ ret = EV_DISABLE;
+ break;
+	case MEV_DEL_PENDING:
+		ret = EV_DELETE;
+		break;
+	default:
+		ret = 0;	/* keep 'ret' initialized; all states handled above */
+		break;
+	}
+
+ return (ret);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+ /* XXX nothing yet, perhaps EV_EOF for reads ? */
+ return (0);
+}
+
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+ struct mevent *mevp, *tmpp;
+ int i;
+
+ i = 0;
+
+ mevent_qlock();
+
+ LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+ if (mevp->me_closefd) {
+ /*
+ * A close of the file descriptor will remove the
+ * event
+ */
+ close(mevp->me_fd);
+ } else {
+ kev[i].ident = mevp->me_fd;
+ kev[i].filter = mevent_kq_filter(mevp);
+ kev[i].flags = mevent_kq_flags(mevp);
+ kev[i].fflags = mevent_kq_fflags(mevp);
+ kev[i].data = 0;
+ kev[i].udata = mevp;
+ i++;
+ }
+
+ mevp->me_cq = 0;
+ LIST_REMOVE(mevp, me_list);
+
+ if (mevp->me_state == MEV_DEL_PENDING) {
+ free(mevp);
+ } else {
+ LIST_INSERT_HEAD(&global_head, mevp, me_list);
+ }
+
+ assert(i < MEVENT_MAX);
+ }
+
+ mevent_qunlock();
+
+ return (i);
+}
+
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+ struct mevent *mevp;
+ int i;
+
+ for (i = 0; i < numev; i++) {
+ mevp = kev[i].udata;
+
+ /* XXX check for EV_ERROR ? */
+
+ (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+ }
+}
+
+struct mevent *
+mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *), void *param)
+{
+ struct mevent *lp, *mevp;
+
+ if (fd < 0 || func == NULL) {
+ return (NULL);
+ }
+
+ mevp = NULL;
+
+ mevent_qlock();
+
+ /*
+ * Verify that the fd/type tuple is not present in any list
+ */
+ LIST_FOREACH(lp, &global_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ LIST_FOREACH(lp, &change_head, me_list) {
+ if (lp->me_fd == fd && lp->me_type == type) {
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate an entry, populate it, and add it to the change list.
+ */
+ mevp = malloc(sizeof(struct mevent));
+ if (mevp == NULL) {
+ goto exit;
+ }
+
+ memset(mevp, 0, sizeof(struct mevent));
+ mevp->me_fd = fd;
+ mevp->me_type = type;
+ mevp->me_func = func;
+ mevp->me_param = param;
+
+ LIST_INSERT_HEAD(&change_head, mevp, me_list);
+ mevp->me_cq = 1;
+ mevp->me_state = MEV_ENABLE;
+ mevent_notify();
+
+exit:
+ mevent_qunlock();
+
+ return (mevp);
+}
+
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+ /*
+ * It's not possible to enable/disable a deleted event
+ */
+ if (evp->me_state == MEV_DEL_PENDING)
+ return (EINVAL);
+
+ /*
+ * No update needed if state isn't changing
+ */
+ if (evp->me_state == newstate)
+ return (0);
+
+ mevent_qlock();
+
+ evp->me_state = newstate;
+
+ /*
+ * Place the entry onto the changed list if not already there.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_enable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_ENABLE));
+}
+
+int
+mevent_disable(struct mevent *evp)
+{
+
+ return (mevent_update(evp, MEV_DISABLE));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+ mevent_qlock();
+
+ /*
+ * Place the entry onto the changed list if not already there, and
+ * mark as to be deleted.
+ */
+ if (evp->me_cq == 0) {
+ evp->me_cq = 1;
+ LIST_REMOVE(evp, me_list);
+ LIST_INSERT_HEAD(&change_head, evp, me_list);
+ mevent_notify();
+ }
+ evp->me_state = MEV_DEL_PENDING;
+
+ if (closefd)
+ evp->me_closefd = 1;
+
+ mevent_qunlock();
+
+ return (0);
+}
+
+int
+mevent_delete(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 0));
+}
+
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+ return (mevent_delete_event(evp, 1));
+}
+
+static void
+mevent_set_name(void)
+{
+ char tname[MAXCOMLEN + 1];
+
+ snprintf(tname, sizeof(tname), "%s mevent", vmname);
+ pthread_set_name_np(mevent_tid, tname);
+}
+
+void
+mevent_dispatch(void)
+{
+ struct kevent changelist[MEVENT_MAX];
+ struct kevent eventlist[MEVENT_MAX];
+ struct mevent *pipev;
+ int mfd;
+ int numev;
+ int ret;
+
+ mevent_tid = pthread_self();
+ mevent_set_name();
+
+ mfd = kqueue();
+ assert(mfd > 0);
+
+ /*
+ * Open the pipe that will be used for other threads to force
+ * the blocking kqueue call to exit by writing to it. Set the
+ * descriptor to non-blocking.
+ */
+	ret = pipe(mevent_pipefd);
+	if (ret < 0) {
+		perror("pipe");
+		exit(1);
+	}
+
+	/*
+	 * The read side must be non-blocking for the drain loop in
+	 * mevent_pipe_read() to be safe, so set the flag explicitly.
+	 */
+	ret = fcntl(mevent_pipefd[0], F_SETFL, O_NONBLOCK);
+	assert(ret == 0);
+
+ /*
+ * Add internal event handler for the pipe write fd
+ */
+ pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+ assert(pipev != NULL);
+
+ for (;;) {
+ /*
+ * Build changelist if required.
+ * XXX the changelist can be put into the blocking call
+ * to eliminate the extra syscall. Currently better for
+ * debug.
+ */
+ numev = mevent_build(mfd, changelist);
+ if (numev) {
+ ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent change");
+ }
+ }
+
+ /*
+ * Block awaiting events
+ */
+ ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
+ if (ret == -1) {
+ perror("Error return from kevent monitor");
+ }
+
+ /*
+ * Handle reported events
+ */
+ mevent_handle(eventlist, ret);
+ }
+}
diff --git a/usr.sbin/bhyve/mevent.h b/usr.sbin/bhyve/mevent.h
new file mode 100644
index 0000000..32a9d74
--- /dev/null
+++ b/usr.sbin/bhyve/mevent.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MEVENT_H_
+#define _MEVENT_H_
+
+enum ev_type {
+ EVF_READ,
+ EVF_WRITE
+};
+
+struct mevent;
+
+struct mevent *mevent_add(int fd, enum ev_type type,
+ void (*func)(int, enum ev_type, void *),
+ void *param);
+int mevent_enable(struct mevent *evp);
+int mevent_disable(struct mevent *evp);
+int mevent_delete(struct mevent *evp);
+int mevent_delete_close(struct mevent *evp);
+
+void mevent_dispatch(void);
+
+#endif /* _MEVENT_H_ */
diff --git a/usr.sbin/bhyve/mevent_test.c b/usr.sbin/bhyve/mevent_test.c
new file mode 100644
index 0000000..c72a497
--- /dev/null
+++ b/usr.sbin/bhyve/mevent_test.c
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Test program for the micro event library. It sets up a simple TCP
+ * echo service on port 4321.
+ *
+ * cc mevent_test.c mevent.c -lpthread
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include "mevent.h"
+
+#define TEST_PORT 4321
+
+static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
+
+#define MEVENT_ECHO
+
+#ifdef MEVENT_ECHO
+struct esync {
+ pthread_mutex_t e_mt;
+ pthread_cond_t e_cond;
+};
+
+static void
+echoer_callback(int fd, enum ev_type type, void *param)
+{
+ struct esync *sync = param;
+
+ pthread_mutex_lock(&sync->e_mt);
+ pthread_cond_signal(&sync->e_cond);
+ pthread_mutex_unlock(&sync->e_mt);
+}
+
+static void *
+echoer(void *param)
+{
+ struct esync sync;
+ struct mevent *mev;
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ pthread_mutex_init(&sync.e_mt, NULL);
+ pthread_cond_init(&sync.e_cond, NULL);
+
+ pthread_mutex_lock(&sync.e_mt);
+
+ mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
+ if (mev == NULL) {
+ printf("Could not allocate echoer event\n");
+ exit(1);
+ }
+
+ while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
+ len = read(fd, buf, sizeof(buf));
+ if (len > 0) {
+ write(fd, buf, len);
+			write(1, buf, len);
+ } else {
+ break;
+ }
+ }
+
+ mevent_delete_close(mev);
+
+	pthread_mutex_unlock(&sync.e_mt);
+	pthread_mutex_destroy(&sync.e_mt);
+	pthread_cond_destroy(&sync.e_cond);
+
+	return (NULL);
+}
+
+#else
+
+static void *
+echoer(void *param)
+{
+ char buf[128];
+ int fd = (int)(uintptr_t) param;
+ int len;
+
+ while ((len = read(fd, buf, sizeof(buf))) > 0) {
+ write(1, buf, len);
+	}
+
+	return (NULL);
+}
+#endif /* MEVENT_ECHO */
+
+static void
+acceptor_callback(int fd, enum ev_type type, void *param)
+{
+ pthread_mutex_lock(&accept_mutex);
+ pthread_cond_signal(&accept_condvar);
+ pthread_mutex_unlock(&accept_mutex);
+}
+
+static void *
+acceptor(void *param)
+{
+ struct sockaddr_in sin;
+ pthread_t tid;
+ int news;
+ int s;
+
+ if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(TEST_PORT);
+
+ if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ if (listen(s, 1) < 0) {
+ perror("listen");
+ exit(1);
+ }
+
+ (void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
+
+ pthread_mutex_lock(&accept_mutex);
+
+ while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
+ news = accept(s, NULL, NULL);
+ if (news < 0) {
+ perror("accept error");
+ } else {
+ printf("incoming connection, spawning thread\n");
+ pthread_create(&tid, NULL, echoer,
+ (void *)(uintptr_t)news);
+ }
+	}
+
+	return (NULL);
+}
+
+int
+main(void)
+{
+ pthread_t tid;
+
+ pthread_create(&tid, NULL, acceptor, NULL);
+
+ mevent_dispatch();
+}
diff --git a/usr.sbin/bhyve/mptbl.c b/usr.sbin/bhyve/mptbl.c
new file mode 100644
index 0000000..52790f3
--- /dev/null
+++ b/usr.sbin/bhyve/mptbl.c
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <x86/mptable.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "bhyverun.h"
+#include "mptbl.h"
+
+#define MPTABLE_BASE 0xF0000
+
+#define LAPIC_PADDR 0xFEE00000
+#define LAPIC_VERSION 16
+
+#define IOAPIC_PADDR 0xFEC00000
+#define IOAPIC_VERSION 0x11
+
+#define MP_SPECREV 4
+#define MPFP_SIG "_MP_"
+
+/* Configuration header defines */
+#define MPCH_SIG "PCMP"
+#define MPCH_OEMID "BHyVe "
+#define MPCH_OEMID_LEN 8
+#define MPCH_PRODID "Hypervisor "
+#define MPCH_PRODID_LEN 12
+
+/* Processor entry defines */
+#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
+#define MPEP_SIG_MODEL 26
+#define MPEP_SIG_STEPPING 5
+#define MPEP_SIG \
+ ((MPEP_SIG_FAMILY << 8) | \
+ (MPEP_SIG_MODEL << 4) | \
+ (MPEP_SIG_STEPPING))
+
+#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
+
+/* Define processor entry struct since <x86/mptable.h> gets it wrong */
+typedef struct BPROCENTRY {
+ u_char type;
+ u_char apic_id;
+ u_char apic_version;
+ u_char cpu_flags;
+ uint32_t cpu_signature;
+ uint32_t feature_flags;
+ uint32_t reserved1;
+ uint32_t reserved2;
+} *bproc_entry_ptr;
+CTASSERT(sizeof(struct BPROCENTRY) == 20);
+
+/* Bus entry defines */
+#define MPE_NUM_BUSES 2
+#define MPE_BUSNAME_LEN 6
+#define MPE_BUSNAME_ISA "ISA "
+#define MPE_BUSNAME_PCI "PCI "
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+static uint8_t
+mpt_compute_checksum(void *base, size_t len)
+{
+ uint8_t *bytes;
+ uint8_t sum;
+
+	for (bytes = base, sum = 0; len > 0; len--) {
+ sum += *bytes++;
+ }
+
+ return (256 - sum);
+}
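+
+/*
+ * Worked example (illustrative): the checksum field of a structure is
+ * still zero while mpt_compute_checksum() runs, so after
+ *
+ *	p->checksum = mpt_compute_checksum(p, len);
+ *
+ * the 'len' bytes starting at 'p' sum to 0 modulo 256, which is the
+ * property the MP specification requires consumers to verify.
+ */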
+
+static void
+mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
+{
+
+ memset(mpfp, 0, sizeof(*mpfp));
+ memcpy(mpfp->signature, MPFP_SIG, 4);
+ mpfp->pap = gpa + sizeof(*mpfp);
+ mpfp->length = 1;
+ mpfp->spec_rev = MP_SPECREV;
+ mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
+}
+
+static void
+mpt_build_mpch(mpcth_t mpch)
+{
+
+ memset(mpch, 0, sizeof(*mpch));
+ memcpy(mpch->signature, MPCH_SIG, 4);
+ mpch->spec_rev = MP_SPECREV;
+ memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
+ memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
+ mpch->apic_address = LAPIC_PADDR;
+}
+
+static void
+mpt_build_proc_entries(bproc_entry_ptr mpep, int ncpu)
+{
+ int i;
+
+ for (i = 0; i < ncpu; i++) {
+ memset(mpep, 0, sizeof(*mpep));
+ mpep->type = MPCT_ENTRY_PROCESSOR;
+ mpep->apic_id = i; // XXX
+ mpep->apic_version = LAPIC_VERSION;
+ mpep->cpu_flags = PROCENTRY_FLAG_EN;
+ if (i == 0)
+ mpep->cpu_flags |= PROCENTRY_FLAG_BP;
+ mpep->cpu_signature = MPEP_SIG;
+ mpep->feature_flags = MPEP_FEATURES;
+ mpep++;
+ }
+}
+
+static void
+mpt_build_bus_entries(bus_entry_ptr mpeb)
+{
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = ISA;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+ mpeb++;
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = PCI;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+}
+
+static void
+mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
+{
+
+ memset(mpei, 0, sizeof(*mpei));
+ mpei->type = MPCT_ENTRY_IOAPIC;
+ mpei->apic_id = id;
+ mpei->apic_version = IOAPIC_VERSION;
+ mpei->apic_flags = IOAPICENTRY_FLAG_EN;
+ mpei->apic_address = IOAPIC_PADDR;
+}
+
+#ifdef notyet
+static void
+mpt_build_ioint_entries(struct mpe_ioint *mpeii, int num_pins, int id)
+{
+ int pin;
+
+	/*
+	 * The following config is taken from the kernel's mptable.c,
+	 * mptable_parse_default_config_ints(...). For now just use the
+	 * default config; tweak it later if needed.
+	 */
+
+ /* Run through all 16 pins. */
+ for (pin = 0; pin < num_pins; pin++) {
+ memset(mpeii, 0, sizeof(*mpeii));
+ mpeii->entry_type = MP_ENTRY_IOINT;
+ mpeii->src_bus_id = MPE_BUSID_ISA;
+ mpeii->dst_apic_id = id;
+
+ /*
+ * All default configs route IRQs from bus 0 to the first 16
+ * pins of the first I/O APIC with an APIC ID of 2.
+ */
+ mpeii->dst_apic_intin = pin;
+ switch (pin) {
+ case 0:
+ /* Pin 0 is an ExtINT pin. */
+ mpeii->intr_type = MPEII_INTR_EXTINT;
+ break;
+ case 2:
+ /* IRQ 0 is routed to pin 2. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = 0;
+ break;
+ case 5:
+ case 10:
+ case 11:
+ /*
+ * PCI Irqs set to level triggered.
+ */
+ mpeii->intr_flags = MPEII_FLAGS_TRIGMODE_LEVEL;
+ mpeii->src_bus_id = MPE_BUSID_PCI;
+ default:
+ /* All other pins are identity mapped. */
+ mpeii->intr_type = MPEII_INTR_INT;
+ mpeii->src_bus_irq = pin;
+ break;
+ }
+ mpeii++;
+ }
+
+}
+
+#define COPYSTR(dest, src, bytes) \
+ memcpy(dest, src, bytes); \
+ str[bytes] = 0;
+
+static void
+mptable_dump(struct mp_floating_pointer *mpfp, struct mp_config_hdr *mpch)
+{
+ static char str[16];
+ int i;
+ char *cur;
+
+ union mpe {
+ struct mpe_proc *proc;
+ struct mpe_bus *bus;
+ struct mpe_ioapic *ioapic;
+ struct mpe_ioint *ioint;
+ struct mpe_lint *lnit;
+ char *p;
+ };
+
+ union mpe mpe;
+
+ printf(" MP Floating Pointer :\n");
+ COPYSTR(str, mpfp->signature, 4);
+ printf("\tsignature:\t%s\n", str);
+ printf("\tmpch paddr:\t%x\n", mpfp->mptable_paddr);
+ printf("\tlength:\t%x\n", mpfp->length);
+ printf("\tspecrec:\t%x\n", mpfp->specrev);
+ printf("\tchecksum:\t%x\n", mpfp->checksum);
+ printf("\tfeature1:\t%x\n", mpfp->feature1);
+ printf("\tfeature2:\t%x\n", mpfp->feature2);
+ printf("\tfeature3:\t%x\n", mpfp->feature3);
+ printf("\tfeature4:\t%x\n", mpfp->feature4);
+
+ printf(" MP Configuration Header :\n");
+ COPYSTR(str, mpch->signature, 4);
+ printf(" signature: %s\n", str);
+ printf(" length: %x\n", mpch->length);
+ printf(" specrec: %x\n", mpch->specrev);
+ printf(" checksum: %x\n", mpch->checksum);
+ COPYSTR(str, mpch->oemid, MPCH_OEMID_LEN);
+ printf(" oemid: %s\n", str);
+ COPYSTR(str, mpch->prodid, MPCH_PRODID_LEN);
+ printf(" prodid: %s\n", str);
+ printf(" oem_ptr: %x\n", mpch->oem_ptr);
+ printf(" oem_sz: %x\n", mpch->oem_sz);
+ printf(" nr_entries: %x\n", mpch->nr_entries);
+ printf(" apic paddr: %x\n", mpch->lapic_paddr);
+ printf(" ext_length: %x\n", mpch->ext_length);
+ printf(" ext_checksum: %x\n", mpch->ext_checksum);
+
+ cur = (char *)mpch + sizeof(*mpch);
+ for (i = 0; i < mpch->nr_entries; i++) {
+ mpe.p = cur;
+ switch(*mpe.p) {
+ case MP_ENTRY_PROC:
+ printf(" MP Processor Entry :\n");
+ printf(" lapic_id: %x\n", mpe.proc->lapic_id);
+ printf(" lapic_version: %x\n", mpe.proc->lapic_version);
+ printf(" proc_flags: %x\n", mpe.proc->proc_flags);
+ printf(" proc_signature: %x\n", mpe.proc->proc_signature);
+ printf(" feature_flags: %x\n", mpe.proc->feature_flags);
+ cur += sizeof(struct mpe_proc);
+ break;
+ case MP_ENTRY_BUS:
+ printf(" MP Bus Entry :\n");
+ printf(" busid: %x\n", mpe.bus->busid);
+ COPYSTR(str, mpe.bus->busname, MPE_BUSNAME_LEN);
+ printf(" busname: %s\n", str);
+ cur += sizeof(struct mpe_bus);
+ break;
+ case MP_ENTRY_IOAPIC:
+ printf(" MP IOAPIC Entry :\n");
+ printf(" ioapi_id: %x\n", mpe.ioapic->ioapic_id);
+ printf(" ioapi_version: %x\n", mpe.ioapic->ioapic_version);
+ printf(" ioapi_flags: %x\n", mpe.ioapic->ioapic_flags);
+ printf(" ioapi_paddr: %x\n", mpe.ioapic->ioapic_paddr);
+ cur += sizeof(struct mpe_ioapic);
+ break;
+ case MP_ENTRY_IOINT:
+ printf(" MP IO Interrupt Entry :\n");
+ printf(" intr_type: %x\n", mpe.ioint->intr_type);
+ printf(" intr_flags: %x\n", mpe.ioint->intr_flags);
+ printf(" src_bus_id: %x\n", mpe.ioint->src_bus_id);
+ printf(" src_bus_irq: %x\n", mpe.ioint->src_bus_irq);
+ printf(" dst_apic_id: %x\n", mpe.ioint->dst_apic_id);
+ printf(" dst_apic_intin: %x\n", mpe.ioint->dst_apic_intin);
+ cur += sizeof(struct mpe_ioint);
+ break;
+ case MP_ENTRY_LINT:
+ printf(" MP Local Interrupt Entry :\n");
+ cur += sizeof(struct mpe_lint);
+ break;
+ }
+
+ }
+}
+#endif
+
+void
+mptable_add_oemtbl(void *tbl, int tblsz)
+{
+
+ oem_tbl_start = tbl;
+ oem_tbl_size = tblsz;
+}
+
+int
+mptable_build(struct vmctx *ctx, int ncpu, int ioapic)
+{
+ mpcth_t mpch;
+ bus_entry_ptr mpeb;
+ io_apic_entry_ptr mpei;
+ bproc_entry_ptr mpep;
+ mpfps_t mpfp;
+ char *curraddr;
+ char *startaddr;
+
+ if (paddr_guest2host(0) == NULL) {
+ printf("mptable requires mapped mem\n");
+ return (ENOMEM);
+ }
+
+ startaddr = curraddr = paddr_guest2host(MPTABLE_BASE);
+
+ mpfp = (mpfps_t)curraddr;
+ mpt_build_mpfp(mpfp, MPTABLE_BASE);
+ curraddr += sizeof(*mpfp);
+
+ mpch = (mpcth_t)curraddr;
+ mpt_build_mpch(mpch);
+ curraddr += sizeof(*mpch);
+
+ mpep = (bproc_entry_ptr)curraddr;
+ mpt_build_proc_entries(mpep, ncpu);
+ curraddr += sizeof(*mpep) * ncpu;
+ mpch->entry_count += ncpu;
+
+ mpeb = (bus_entry_ptr) curraddr;
+ mpt_build_bus_entries(mpeb);
+ curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
+ mpch->entry_count += MPE_NUM_BUSES;
+
+ if (ioapic) {
+ mpei = (io_apic_entry_ptr)curraddr;
+ mpt_build_ioapic_entries(mpei, ncpu + 1);
+ curraddr += sizeof(*mpei);
+ mpch->entry_count++;
+ }
+
+#ifdef notyet
+ mpt_build_ioint_entries((struct mpe_ioint*)curraddr, MPEII_MAX_IRQ,
+ ncpu + 1);
+ curraddr += sizeof(struct mpe_ioint) * MPEII_MAX_IRQ;
+ mpch->entry_count += MPEII_MAX_IRQ;
+#endif
+
+ if (oem_tbl_start) {
+ mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
+ mpch->oem_table_size = oem_tbl_size;
+ memcpy(curraddr, oem_tbl_start, oem_tbl_size);
+ }
+
+	mpch->base_table_length = curraddr - (char *)mpch;
+	/* The MP config table checksum covers the entire base table */
+	mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length);
+
+ return (0);
+}
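+
+/*
+ * Resulting guest-physical layout at MPTABLE_BASE (illustrative):
+ *
+ *	0xF0000: MP floating pointer (pap points at 0xF0010)
+ *	0xF0010: MP configuration table header
+ *	    ...: ncpu processor entries, MPE_NUM_BUSES bus entries,
+ *	         an optional I/O APIC entry and an optional OEM table
+ */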
diff --git a/usr.sbin/bhyve/mptbl.h b/usr.sbin/bhyve/mptbl.h
new file mode 100644
index 0000000..3c4c527
--- /dev/null
+++ b/usr.sbin/bhyve/mptbl.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MPTBL_H_
+#define _MPTBL_H_
+
+int mptable_build(struct vmctx *ctx, int ncpu, int ioapic);
+void mptable_add_oemtbl(void *tbl, int tblsz);
+
+#endif /* _MPTBL_H_ */
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
new file mode 100644
index 0000000..e086aeb
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -0,0 +1,1117 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "mem.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "ioapic.h"
+
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+#define CFGWRITE(pi,off,val,b) \
+do { \
+ if ((b) == 1) { \
+ pci_set_cfgdata8((pi),(off),(val)); \
+ } else if ((b) == 2) { \
+ pci_set_cfgdata16((pi),(off),(val)); \
+ } else { \
+ pci_set_cfgdata32((pi),(off),(val)); \
+ } \
+} while (0)
+
+#define MAXSLOTS (PCI_SLOTMAX + 1)
+#define MAXFUNCS (PCI_FUNCMAX + 1)
+
+static struct slotinfo {
+ char *si_name;
+ char *si_param;
+ struct pci_devinst *si_devi;
+ int si_legacy;
+} pci_slotinfo[MAXSLOTS][MAXFUNCS];
+
+/*
+ * Used to keep track of legacy interrupt owners/requestors
+ */
+#define NLIRQ 16
+
+static struct lirqinfo {
+ int li_generic;
+ int li_acount;
+ struct pci_devinst *li_owner; /* XXX should be a list */
+} lirq[NLIRQ];
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+static uint64_t pci_emul_iobase;
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+
+#define PCI_EMUL_MEMBASE32 (lomem_sz)
+#define PCI_EMUL_MEMLIMIT32 0xE0000000 /* 3.5GB */
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
+
+static int pci_emul_devices;
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <slot>[:<func>],<emul>[,<config>]
+ *
+ * slot is 0..31
+ * func is 0..7
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3:0,dummy
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+ printf("Invalid PCI slot info field \"%s\"\n", aopt);
+ free(aopt);
+}
+
+void
+pci_parse_slot(char *opt, int legacy)
+{
+ char *slot, *func, *emul, *config;
+ char *str, *cpy;
+ int snum, fnum;
+
+ str = cpy = strdup(opt);
+
+ config = NULL;
+
+ if (strchr(str, ':') != NULL) {
+ slot = strsep(&str, ":");
+ func = strsep(&str, ",");
+ } else {
+ slot = strsep(&str, ",");
+ func = NULL;
+ }
+
+ emul = strsep(&str, ",");
+ if (str != NULL) {
+ config = strsep(&str, ",");
+ }
+
+ if (emul == NULL) {
+ pci_parse_slot_usage(cpy);
+ return;
+ }
+
+ snum = atoi(slot);
+ fnum = func ? atoi(func) : 0;
+ if (snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) {
+ pci_parse_slot_usage(cpy);
+ } else {
+ pci_slotinfo[snum][fnum].si_name = emul;
+ pci_slotinfo[snum][fnum].si_param = config;
+ pci_slotinfo[snum][fnum].si_legacy = legacy;
+ }
+}
+
+static int
+pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pdi = arg;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int i;
+
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ if (pdi->pi_bar[i].type == PCIBAR_IO &&
+ port >= pdi->pi_bar[i].addr &&
+ port + bytes <=
+ pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+ offset = port - pdi->pi_bar[i].addr;
+ if (in)
+ *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
+ offset, bytes);
+ else
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
+ bytes, *eax);
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+static int
+pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct pci_devinst *pdi = arg1;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int bidx = (int) arg2;
+
+ assert(bidx <= PCI_BARMAX);
+ assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
+ pdi->pi_bar[bidx].type == PCIBAR_MEM64);
+ assert(addr >= pdi->pi_bar[bidx].addr &&
+ addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
+
+ offset = addr - pdi->pi_bar[bidx].addr;
+
+ if (dir == MEM_F_WRITE)
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, size, *val);
+ else
+ *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, offset, size);
+
+ return (0);
+}
+
+
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+ uint64_t *addr)
+{
+ uint64_t base;
+
+ assert((size & (size - 1)) == 0); /* must be a power of 2 */
+
+ base = roundup2(*baseptr, size);
+
+ if (base + size <= limit) {
+ *addr = base;
+ *baseptr = base + size;
+ return (0);
+ } else
+ return (-1);
+}
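+
+/*
+ * Worked example (illustrative): with *baseptr = 0x2100 and a request
+ * of size 0x1000, roundup2() aligns the base up to 0x3000; the BAR is
+ * placed there and *baseptr advances to 0x4000 for the next request.
+ */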
+
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+ uint64_t size)
+{
+
+ return (pci_emul_alloc_pbar(pdi, idx, 0, type, size));
+}
+
+int
+pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size)
+{
+ int i, error;
+ uint64_t *baseptr, limit, addr, mask, lobits, bar;
+ struct inout_port iop;
+ struct mem_range memp;
+
+ assert(idx >= 0 && idx <= PCI_BARMAX);
+
+ if ((size & (size - 1)) != 0)
+ size = 1UL << flsl(size); /* round up to a power of 2 */
+
+ switch (type) {
+ case PCIBAR_NONE:
+ baseptr = NULL;
+ addr = mask = lobits = 0;
+ break;
+ case PCIBAR_IO:
+ if (hostbase &&
+ pci_slotinfo[pdi->pi_slot][pdi->pi_func].si_legacy) {
+ assert(hostbase < PCI_EMUL_IOBASE);
+ baseptr = &hostbase;
+ } else {
+ baseptr = &pci_emul_iobase;
+ }
+ limit = PCI_EMUL_IOLIMIT;
+ mask = PCIM_BAR_IO_BASE;
+ lobits = PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM64:
+ /*
+ * XXX
+ * Some drivers do not work well if the 64-bit BAR is allocated
+ * above 4GB. Allow for this by allocating small requests under
+		 * 4GB unless the allocation size is larger than some arbitrary
+ * number (32MB currently).
+ */
+ if (size > 32 * 1024 * 1024) {
+ /*
+ * XXX special case for device requiring peer-peer DMA
+ */
+ if (size == 0x100000000UL)
+ baseptr = &hostbase;
+ else
+ baseptr = &pci_emul_membase64;
+ limit = PCI_EMUL_MEMLIMIT64;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ } else {
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
+ }
+ break;
+ case PCIBAR_MEM32:
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ default:
+		printf("pci_emul_alloc_pbar: invalid bar type %d\n", type);
+ assert(0);
+ }
+
+ if (baseptr != NULL) {
+ error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+ if (error != 0)
+ return (error);
+ }
+
+ pdi->pi_bar[idx].type = type;
+ pdi->pi_bar[idx].addr = addr;
+ pdi->pi_bar[idx].size = size;
+
+ /* Initialize the BAR register in config space */
+ bar = (addr & mask) | lobits;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+ if (type == PCIBAR_MEM64) {
+ assert(idx + 1 <= PCI_BARMAX);
+ pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+ }
+
+ /* add a handler to intercept accesses to the I/O bar */
+ if (type == PCIBAR_IO) {
+ iop.name = pdi->pi_name;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_io_handler;
+ iop.arg = pdi;
+
+ for (i = 0; i < size; i++) {
+ iop.port = addr + i;
+ register_inout(&iop);
+ }
+ } else if (type == PCIBAR_MEM32 || type == PCIBAR_MEM64) {
+ /* add memory bar intercept handler */
+ memp.name = pdi->pi_name;
+ memp.flags = MEM_F_RW;
+ memp.base = addr;
+ memp.size = size;
+ memp.handler = pci_emul_mem_handler;
+ memp.arg1 = pdi;
+ memp.arg2 = idx;
+
+ error = register_mem(&memp);
+ assert(error == 0);
+ }
+
+ return (0);
+}
+
+#define CAP_START_OFFSET 0x40
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+ int i, capoff, capid, reallen;
+ uint16_t sts;
+
+ static u_char endofcap[4] = {
+ PCIY_RESERVED, 0, 0, 0
+ };
+
+ assert(caplen > 0 && capdata[0] != PCIY_RESERVED);
+
+ reallen = roundup2(caplen, 4); /* dword aligned */
+
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
+ capoff = CAP_START_OFFSET;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
+ pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+ } else {
+ capoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((capoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ capoff = pci_get_cfgdata8(pi, capoff + 1);
+ }
+ }
+
+ /* Check if we have enough space */
+ if (capoff + reallen + sizeof(endofcap) > PCI_REGMAX + 1)
+ return (-1);
+
+ /* Copy the capability */
+ for (i = 0; i < caplen; i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ /* Set the next capability pointer */
+ pci_set_cfgdata8(pi, capoff + 1, capoff + reallen);
+
+ /* Copy of the reserved capability which serves as the end marker */
+ for (i = 0; i < sizeof(endofcap); i++)
+ pci_set_cfgdata8(pi, capoff + reallen + i, endofcap[i]);
+
+ return (0);
+}
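+
+/*
+ * Illustrative layout: after a single 14-byte MSI capability is added
+ * (rounded up to 16 bytes), config space holds a list rooted at
+ * PCIR_CAP_PTR:
+ *
+ *	0x34 (PCIR_CAP_PTR) -> 0x40: PCIY_MSI, next = 0x50, body...
+ *	                       0x50: PCIY_RESERVED end-of-list marker
+ */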
+
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+ struct pci_devemu **pdpp, *pdp;
+
+ SET_FOREACH(pdpp, pci_devemu_set) {
+ pdp = *pdpp;
+ if (!strcmp(pdp->pe_emu, name)) {
+ return (pdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int slot, int func,
+ char *params)
+{
+ struct pci_devinst *pdi;
+ pdi = malloc(sizeof(struct pci_devinst));
+ bzero(pdi, sizeof(*pdi));
+
+ pdi->pi_vmctx = ctx;
+ pdi->pi_bus = 0;
+ pdi->pi_slot = slot;
+ pdi->pi_func = func;
+ pdi->pi_d = pde;
+ snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+ /* Disable legacy interrupts */
+ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+ pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+ pci_set_cfgdata8(pdi, PCIR_COMMAND,
+ PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+ if ((*pde->pe_init)(ctx, pdi, params) != 0) {
+ free(pdi);
+ } else {
+ pci_emul_devices++;
+ pci_slotinfo[slot][func].si_devi = pdi;
+ }
+}
+
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+ int mmc;
+
+ CTASSERT(sizeof(struct msicap) == 14);
+
+ /* Number of msi messages must be a power of 2 between 1 and 32 */
+ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+ mmc = ffs(msgnum) - 1;
+
+ bzero(msicap, sizeof(struct msicap));
+ msicap->capid = PCIY_MSI;
+ msicap->nextptr = nextptr;
+ msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
+}
+
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+ struct msicap msicap;
+
+ pci_populate_msicap(&msicap, msgnum, 0);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
+}
+
+void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask;
+ int off, table_bar;
+
+ off = offset - capoff;
+ table_bar = pi->pi_msix.table_bar;
+ /* Message Control Register */
+ if (off == 2 && bytes == 2) {
+ rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask, msgdata, mme;
+ uint32_t addrlo;
+
+ /*
+ * If guest is writing to the message control register make sure
+ * we do not overwrite read-only fields.
+ */
+ if ((offset - capoff) == 2 && bytes == 2) {
+ rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ addrlo = pci_get_cfgdata32(pi, capoff + 4);
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ msgdata = pci_get_cfgdata16(pi, capoff + 12);
+ else
+ msgdata = pci_get_cfgdata16(pi, capoff + 8);
+
+ /*
+ * XXX check delivery mode, destination mode etc
+ */
+ mme = msgctrl & PCIM_MSICTRL_MME_MASK;
+ pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+ if (pi->pi_msi.enabled) {
+ pi->pi_msi.cpu = (addrlo >> 12) & 0xff;
+ pi->pi_msi.vector = msgdata & 0xff;
+ pi->pi_msi.msgnum = 1 << (mme >> 4);
+ } else {
+ pi->pi_msi.cpu = 0;
+ pi->pi_msi.vector = 0;
+ pi->pi_msi.msgnum = 0;
+ }
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
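+
+/*
+ * Worked example (illustrative): a guest that programs the MSI address
+ * register with 0xFEE01000 and the data register with 0x0041 is decoded
+ * above as cpu = (0xFEE01000 >> 12) & 0xff = 1 (the APIC destination
+ * ID) and vector = 0x41; pci_generate_msi() then injects that vector
+ * into vCPU 1's local APIC.
+ */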
+
+/*
+ * This function assumes that 'offset' is in the capabilities region of the
+ * config space.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+ int capid;
+ uint8_t capoff, nextoff;
+
+ /* Do not allow un-aligned writes */
+ if ((offset & (bytes - 1)) != 0)
+ return;
+
+ /* Find the capability that we want to update */
+ capoff = CAP_START_OFFSET;
+ while (1) {
+ capid = pci_get_cfgdata8(pi, capoff);
+ if (capid == PCIY_RESERVED)
+ break;
+
+ nextoff = pci_get_cfgdata8(pi, capoff + 1);
+ if (offset >= capoff && offset < nextoff)
+ break;
+
+ capoff = nextoff;
+ }
+ assert(offset >= capoff);
+
+ /*
+ * Capability ID and Next Capability Pointer are readonly
+ */
+ if (offset == capoff || offset == capoff + 1)
+ return;
+
+ switch (capid) {
+ case PCIY_MSI:
+ msicap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+ int found;
+ uint16_t sts;
+ uint8_t capid, lastoff;
+
+ found = 0;
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
+ lastoff = pci_get_cfgdata8(pi, PCIR_CAP_PTR);
+ while (1) {
+ assert((lastoff & 0x3) == 0);
+ capid = pci_get_cfgdata8(pi, lastoff);
+ if (capid == PCIY_RESERVED)
+ break;
+ lastoff = pci_get_cfgdata8(pi, lastoff + 1);
+ }
+ if (offset >= CAP_START_OFFSET && offset <= lastoff)
+ found = 1;
+ }
+ return (found);
+}
+
+void
+init_pci(struct vmctx *ctx)
+{
+ struct pci_devemu *pde;
+ struct slotinfo *si;
+ int slot, func;
+
+ pci_emul_iobase = PCI_EMUL_IOBASE;
+ pci_emul_membase32 = PCI_EMUL_MEMBASE32;
+ pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ for (func = 0; func < MAXFUNCS; func++) {
+ si = &pci_slotinfo[slot][func];
+ if (si->si_name != NULL) {
+ pde = pci_emul_finddev(si->si_name);
+ if (pde != NULL) {
+ pci_emul_init(ctx, pde, slot, func,
+ si->si_param);
+ }
+ }
+ }
+ }
+
+ /*
+ * Allow ISA IRQs 5,10,11,12, and 15 to be available for
+ * generic use
+ */
+ lirq[5].li_generic = 1;
+ lirq[10].li_generic = 1;
+ lirq[11].li_generic = 1;
+ lirq[12].li_generic = 1;
+ lirq[15].li_generic = 1;
+}
+
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+ return (pi->pi_msi.enabled);
+}
+
+int
+pci_msi_msgnum(struct pci_devinst *pi)
+{
+ if (pi->pi_msi.enabled)
+ return (pi->pi_msi.msgnum);
+ else
+ return (0);
+}
+
+void
+pci_generate_msi(struct pci_devinst *pi, int msg)
+{
+
+ if (pci_msi_enabled(pi) && msg < pci_msi_msgnum(pi)) {
+ vm_lapic_irq(pi->pi_vmctx,
+ pi->pi_msi.cpu,
+ pi->pi_msi.vector + msg);
+ }
+}
+
+int
+pci_is_legacy(struct pci_devinst *pi)
+{
+
+ return (pci_slotinfo[pi->pi_slot][pi->pi_func].si_legacy);
+}
+
+static int
+pci_lintr_alloc(struct pci_devinst *pi, int vec)
+{
+ int i;
+
+ assert(vec < NLIRQ);
+
+ if (vec == -1) {
+ for (i = 0; i < NLIRQ; i++) {
+ if (lirq[i].li_generic &&
+ lirq[i].li_owner == NULL) {
+ vec = i;
+ break;
+ }
+ }
+ } else {
+ if (lirq[vec].li_owner != NULL) {
+ vec = -1;
+ }
+ }
+ assert(vec != -1);
+
+ lirq[vec].li_owner = pi;
+ pi->pi_lintr_pin = vec;
+
+ return (vec);
+}
+
+int
+pci_lintr_request(struct pci_devinst *pi, int vec)
+{
+
+ vec = pci_lintr_alloc(pi, vec);
+ pci_set_cfgdata8(pi, PCIR_INTLINE, vec);
+ pci_set_cfgdata8(pi, PCIR_INTPIN, 1);
+ return (0);
+}
+
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr_pin);
+ ioapic_assert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
+}
+
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr_pin);
+ ioapic_deassert_pin(pi->pi_vmctx, pi->pi_lintr_pin);
+}
+
+/*
+ * Return 1 if the emulated device in 'slot' is a multi-function device.
+ * Return 0 otherwise.
+ */
+static int
+pci_emul_is_mfdev(int slot)
+{
+ int f, numfuncs;
+
+ numfuncs = 0;
+ for (f = 0; f < MAXFUNCS; f++) {
+ if (pci_slotinfo[slot][f].si_devi != NULL) {
+ numfuncs++;
+ }
+ }
+ return (numfuncs > 1);
+}
+
+/*
+ * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
+ * whether or not a multi-function device is being emulated in the pci 'slot'.
+ */
+static void
+pci_emul_hdrtype_fixup(int slot, int off, int bytes, uint32_t *rv)
+{
+ int mfdev;
+
+ if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
+ mfdev = pci_emul_is_mfdev(slot);
+ switch (bytes) {
+ case 1:
+ case 2:
+ *rv &= ~PCIM_MFDEV;
+ if (mfdev) {
+ *rv |= PCIM_MFDEV;
+ }
+ break;
+ case 4:
+ *rv &= ~(PCIM_MFDEV << 16);
+ if (mfdev) {
+ *rv |= (PCIM_MFDEV << 16);
+ }
+ break;
+ }
+ }
+}
+
+static int cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ assert(!in);
+
+ if (bytes != 4)
+ return (-1);
+
+ x = *eax;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_OUT, pci_emul_cfgaddr);
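+
+/*
+ * Worked example (illustrative): an OUT of 0x80001810 to port 0xcf8 is
+ * decoded above as cfgbus = 0, cfgslot = 3, cfgfunc = 0, cfgoff = 0x10,
+ * i.e. BAR0 of the device in slot 3.  A subsequent access to ports
+ * 0xcfc-0xcff then reads or writes that register via pci_emul_cfgdata().
+ */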
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int coff, idx, needcfg;
+ uint64_t mask, bar;
+
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ if (cfgbus == 0)
+ pi = pci_slotinfo[cfgslot][cfgfunc].si_devi;
+ else
+ pi = NULL;
+
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+
+#if 0
+ printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
+ in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
+#endif
+
+ /*
+ * Just return if there is no device at this cfgslot:cfgfunc or
+ * if the guest is doing an un-aligned access
+ */
+ if (pi == NULL || (coff & (bytes - 1)) != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return (0);
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi,
+ coff, bytes, eax);
+ } else {
+ needcfg = 1;
+ }
+
+ if (needcfg) {
+ if (bytes == 1)
+ *eax = pci_get_cfgdata8(pi, coff);
+ else if (bytes == 2)
+ *eax = pci_get_cfgdata16(pi, coff);
+ else
+ *eax = pci_get_cfgdata32(pi, coff);
+ }
+
+ pci_emul_hdrtype_fixup(cfgslot, coff, bytes, eax);
+ } else {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+ return (0);
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return (0);
+ idx = (coff - PCIR_BAR(0)) / 4;
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ bar = 0;
+ break;
+ case PCIBAR_IO:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_IO_BASE;
+ bar = (*eax & mask) | PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM32:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ case PCIBAR_MEM64:
+ mask = ~(pi->pi_bar[idx].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ case PCIBAR_MEMHI64:
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ mask &= PCIM_BAR_MEM_BASE;
+ bar = ((uint64_t)*eax << 32) & mask;
+ bar = bar >> 32;
+ break;
+ default:
+ assert(0);
+ }
+ pci_set_cfgdata32(pi, coff, bar);
+
+ } else if (pci_emul_iscap(pi, coff)) {
+ pci_emul_capwrite(pi, coff, bytes, *eax);
+ } else {
+ CFGWRITE(pi, coff, *eax, bytes);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+/*
+ * I/O ports to configure PCI IRQ routing. We ignore all writes to it.
+ */
+static int
+pci_irq_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+ return (0);
+}
+INOUT_PORT(pci_irq, 0xC00, IOPORT_F_OUT, pci_irq_port_handler);
+INOUT_PORT(pci_irq, 0xC01, IOPORT_F_OUT, pci_irq_port_handler);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+#define DIOSZ 20
+#define DMEMSZ 4096
+struct pci_emul_dsoftc {
+ uint8_t ioregs[DIOSZ];
+ uint8_t memregs[DMEMSZ];
+};
+
+#define PCI_EMUL_MSI_MSGS 4
+#define PCI_EMUL_MSIX_MSGS 16
+
+static int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int error;
+ struct pci_emul_dsoftc *sc;
+
+ sc = malloc(sizeof(struct pci_emul_dsoftc));
+ memset(sc, 0, sizeof(struct pci_emul_dsoftc));
+
+ pi->pi_arg = sc;
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+ error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
+ assert(error == 0);
+
+ return (0);
+}
+
+static void
+pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ int i;
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("diow: iow too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->ioregs[offset] = value & 0xff;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->ioregs[offset] = value;
+ } else {
+ printf("diow: iow unknown size %d\n", size);
+ }
+
+ /*
+ * Special magic value to generate an interrupt
+ */
+ if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+ pci_generate_msi(pi, value % pci_msi_msgnum(pi));
+
+ if (value == 0xabcdef) {
+ for (i = 0; i < pci_msi_msgnum(pi); i++)
+ pci_generate_msi(pi, i);
+ }
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("diow: memw too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->memregs[offset] = value;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->memregs[offset] = value;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->memregs[offset] = value;
+ } else if (size == 8) {
+ *(uint64_t *)&sc->memregs[offset] = value;
+ } else {
+ printf("diow: memw unknown size %d\n", size);
+ }
+
+ /*
+ * magic interrupt ??
+ */
+ }
+
+ if (baridx > 1) {
+ printf("diow: unknown bar idx %d\n", baridx);
+ }
+}
+
+static uint64_t
+pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("dior: ior too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->ioregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->ioregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->ioregs[offset];
+		} else {
+			printf("dior: ior unknown size %d\n", size);
+			value = 0;
+		}
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("dior: memr too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->memregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->memregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->memregs[offset];
+ } else if (size == 8) {
+ value = *(uint64_t *) &sc->memregs[offset];
+ } else {
+			printf("dior: memr unknown size %d\n", size);
+ }
+ }
+
+ if (baridx > 1) {
+ printf("dior: unknown bar idx %d\n", baridx);
+ return (0);
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_barwrite = pci_emul_diow,
+ .pe_barread = pci_emul_dior
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
new file mode 100644
index 0000000..e924475
--- /dev/null
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -0,0 +1,216 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+#define PCIY_RESERVED 0x00
+
+struct vmctx;
+struct pci_devinst;
+struct memory_region;
+
+struct pci_devemu {
+ char *pe_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*pe_init)(struct vmctx *, struct pci_devinst *,
+ char *opts);
+
+ /* config space read/write callbacks */
+ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t val);
+ int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t *retval);
+
+ /* BAR read/write callbacks */
+ void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value);
+ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size);
+};
+#define PCI_EMUL_SET(x)		DATA_SET(pci_devemu_set, x)
+
+enum pcibar_type {
+ PCIBAR_NONE,
+ PCIBAR_IO,
+ PCIBAR_MEM32,
+ PCIBAR_MEM64,
+ PCIBAR_MEMHI64
+};
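+
+/*
+ * Note that a 64-bit memory BAR occupies two consecutive BAR
+ * registers: the low half is tracked as PCIBAR_MEM64 and the high
+ * half as PCIBAR_MEMHI64 (see the 64-bit handling in cfginitbar()
+ * in pci_passthru.c).
+ */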
+
+struct pcibar {
+ enum pcibar_type type; /* io or memory */
+ uint64_t size;
+ uint64_t addr;
+};
+
+#define PI_NAMESZ 40
+
+struct msix_table_entry {
+ uint64_t addr;
+ uint32_t msg_data;
+ uint32_t vector_control;
+} __packed;
+
+/*
+ * In case the structure is modified to hold extra information, use a define
+ * for the size that should be emulated.
+ */
+#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_SIZE 2048
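+
+/*
+ * Worked example: a 4-byte access at table offset 0x38 decodes as
+ * index = 0x38 / 16 = 3 and entry_offset = 0x38 % 16 = 8, i.e. the
+ * 'msg_data' field of the fourth struct msix_table_entry.
+ */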
+
+struct pci_devinst {
+ struct pci_devemu *pi_d;
+ struct vmctx *pi_vmctx;
+ uint8_t pi_bus, pi_slot, pi_func;
+ uint8_t pi_lintr_pin;
+ char pi_name[PI_NAMESZ];
+ uint16_t pi_iobase;
+ int pi_bar_getsize;
+
+ struct {
+ int enabled;
+ int cpu;
+ int vector;
+ int msgnum;
+ } pi_msi;
+
+ struct {
+ int enabled;
+ int table_bar;
+ int pba_bar;
+ size_t table_offset;
+ size_t table_size;
+ int table_count;
+ size_t pba_offset;
+ struct msix_table_entry table[MAX_MSIX_TABLE_SIZE];
+ } pi_msix;
+
+ void *pi_arg; /* devemu-private data */
+
+ u_char pi_cfgdata[PCI_REGMAX + 1];
+ struct pcibar pi_bar[PCI_BARMAX + 1];
+};
+
+struct msicap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t addrlo;
+ uint32_t addrhi;
+ uint16_t msgdata;
+} __packed;
+
+struct msixcap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t table_offset;
+ uint32_t pba_offset;
+} __packed;
+
+void init_pci(struct vmctx *ctx);
+void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void pci_callback(void);
+int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
+ enum pcibar_type type, uint64_t size);
+int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
+ uint64_t hostbase, enum pcibar_type type, uint64_t size);
+int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+int pci_is_legacy(struct pci_devinst *pi);
+void pci_generate_msi(struct pci_devinst *pi, int msgnum);
+void pci_generate_msix(struct pci_devinst *pi, int msgnum);
+void pci_lintr_assert(struct pci_devinst *pi);
+void pci_lintr_deassert(struct pci_devinst *pi);
+int pci_lintr_request(struct pci_devinst *pi, int ivec);
+int pci_msi_enabled(struct pci_devinst *pi);
+int pci_msix_enabled(struct pci_devinst *pi);
+int pci_msi_msgnum(struct pci_devinst *pi);
+void pci_parse_slot(char *opt, int legacy);
+void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+ assert(offset <= PCI_REGMAX);
+ *(uint8_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ *(uint16_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ *(uint32_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= PCI_REGMAX);
+ return (*(uint8_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ return (*(uint16_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ return (*(uint32_t *)(pi->pi_cfgdata + offset));
+}
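+
+/*
+ * Typical usage from a device model's init routine (as done by the
+ * dummy test device in pci_emul.c):
+ *
+ *	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ *	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ */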
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c
new file mode 100644
index 0000000..c77762d
--- /dev/null
+++ b/usr.sbin/bhyve/pci_hostbridge.c
@@ -0,0 +1,52 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "pci_emul.h"
+
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ /* config space */
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_hostbridge = {
+ .pe_emu = "hostbridge",
+ .pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
new file mode 100644
index 0000000..28abb6b
--- /dev/null
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -0,0 +1,724 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/pciio.h>
+#include <sys/ioctl.h>
+
+#include <dev/io/iodev.h>
+#include <machine/iodev.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+#include "pci_emul.h"
+#include "mem.h"
+
+#ifndef _PATH_DEVPCI
+#define _PATH_DEVPCI "/dev/pci"
+#endif
+
+#ifndef _PATH_DEVIO
+#define _PATH_DEVIO "/dev/io"
+#endif
+
+#define LEGACY_SUPPORT 1
+
+#define MSIX_TABLE_BIR_MASK 7
+#define MSIX_TABLE_OFFSET_MASK	(~MSIX_TABLE_BIR_MASK)
+#define MSIX_TABLE_COUNT(x) (((x) & 0x7FF) + 1)
+#define MSIX_CAPLEN 12
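+
+/*
+ * The table-size field (bits 10:0 of the MSI-X message control word)
+ * is encoded as N-1: e.g. a msgctrl value of 0x0007 describes an
+ * 8-entry table.
+ */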
+
+static int pcifd = -1;
+static int iofd = -1;
+
+struct passthru_softc {
+ struct pci_devinst *psc_pi;
+ struct pcibar psc_bar[PCI_BARMAX + 1];
+ struct {
+ int capoff;
+ int msgctrl;
+ int emulated;
+ } psc_msi;
+ struct {
+ int capoff;
+ } psc_msix;
+ struct pcisel psc_sel;
+};
+
+static int
+msi_caplen(int msgctrl)
+{
+ int len;
+
+ len = 10; /* minimum length of msi capability */
+
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ len += 4;
+
+#if 0
+ /*
+ * Ignore the 'mask' and 'pending' bits in the MSI capability.
+ * We'll let the guest manipulate them directly.
+ */
+ if (msgctrl & PCIM_MSICTRL_VECTOR)
+ len += 10;
+#endif
+
+ return (len);
+}
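+
+/*
+ * For example, a device advertising 64-bit message addresses
+ * (PCIM_MSICTRL_64BIT) yields a capability length of 10 + 4 = 14.
+ */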
+
+static uint32_t
+read_config(const struct pcisel *sel, long reg, int width)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+
+ if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
+ return (0); /* XXX */
+ else
+ return (pi.pi_data);
+}
+
+static void
+write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
+{
+ struct pci_io pi;
+
+ bzero(&pi, sizeof(pi));
+ pi.pi_sel = *sel;
+ pi.pi_reg = reg;
+ pi.pi_width = width;
+ pi.pi_data = data;
+
+ (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
+}
+
+#ifdef LEGACY_SUPPORT
+static int
+passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
+{
+ int capoff, i;
+ struct msicap msicap;
+ u_char *capdata;
+
+ pci_populate_msicap(&msicap, msgnum, nextptr);
+
+ /*
+ * XXX
+ * Copy the msi capability structure in the last 16 bytes of the
+ * config space. This is wrong because it could shadow something
+ * useful to the device.
+ */
+ capoff = 256 - roundup(sizeof(msicap), 4);
+ capdata = (u_char *)&msicap;
+ for (i = 0; i < sizeof(msicap); i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ return (capoff);
+}
+#endif /* LEGACY_SUPPORT */
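+
+/*
+ * With sizeof(struct msicap) == 14, the capability above lands at
+ * config offset 256 - roundup(14, 4) = 0xf0.
+ */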
+
+static int
+cfginitmsi(struct passthru_softc *sc)
+{
+ int ptr, capptr, cap, sts, caplen;
+ uint32_t u32;
+ struct pcisel sel;
+ struct pci_devinst *pi;
+ struct msixcap msixcap;
+ uint32_t *msixcap_ptr;
+
+ pi = sc->psc_pi;
+ sel = sc->psc_sel;
+
+ /*
+ * Parse the capabilities and cache the location of the MSI
+ * and MSI-X capabilities.
+ */
+ sts = read_config(&sel, PCIR_STATUS, 2);
+ if (sts & PCIM_STATUS_CAPPRESENT) {
+ ptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ while (ptr != 0 && ptr != 0xff) {
+ cap = read_config(&sel, ptr + PCICAP_ID, 1);
+ if (cap == PCIY_MSI) {
+ /*
+ * Copy the MSI capability into the config
+ * space of the emulated pci device
+ */
+ sc->psc_msi.capoff = ptr;
+ sc->psc_msi.msgctrl = read_config(&sel,
+ ptr + 2, 2);
+ sc->psc_msi.emulated = 0;
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ }
+ } else if (cap == PCIY_MSIX) {
+ /*
+ * Copy the MSI-X capability
+ */
+ sc->psc_msix.capoff = ptr;
+				caplen = MSIX_CAPLEN;
+ msixcap_ptr = (uint32_t*) &msixcap;
+ capptr = ptr;
+ while (caplen > 0) {
+ u32 = read_config(&sel, capptr, 4);
+ *msixcap_ptr = u32;
+ pci_set_cfgdata32(pi, capptr, u32);
+ caplen -= 4;
+ capptr += 4;
+ msixcap_ptr++;
+ }
+ }
+ ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
+ }
+ }
+
+ if (sc->psc_msix.capoff != 0) {
+ pi->pi_msix.pba_bar =
+ msixcap.pba_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.pba_offset =
+ msixcap.pba_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_bar =
+ msixcap.table_offset & MSIX_TABLE_BIR_MASK;
+ pi->pi_msix.table_offset =
+ msixcap.table_offset & MSIX_TABLE_OFFSET_MASK;
+ pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If the passthrough device does not support MSI then craft a
+ * MSI capability for it. We link the new MSI capability at the
+ * head of the list of capabilities.
+ */
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
+ int origptr, msiptr;
+ origptr = read_config(&sel, PCIR_CAP_PTR, 1);
+ msiptr = passthru_add_msicap(pi, 1, origptr);
+ sc->psc_msi.capoff = msiptr;
+ sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
+ sc->psc_msi.emulated = 1;
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
+ }
+#endif
+
+ /* Make sure one of the capabilities is present */
+ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
+ return (-1);
+ else
+ return (0);
+}
+
+static uint64_t
+msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint8_t *src8;
+ uint16_t *src16;
+ uint32_t *src32;
+ uint64_t *src64;
+ uint64_t data;
+ size_t entry_offset;
+ int index;
+
+ pi = sc->psc_pi;
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ entry = &pi->pi_msix.table[index];
+
+ switch(size) {
+ case 1:
+ src8 = (uint8_t *)((void *)entry + entry_offset);
+ data = *src8;
+ break;
+ case 2:
+ src16 = (uint16_t *)((void *)entry + entry_offset);
+ data = *src16;
+ break;
+ case 4:
+ src32 = (uint32_t *)((void *)entry + entry_offset);
+ data = *src32;
+ break;
+ case 8:
+ src64 = (uint64_t *)((void *)entry + entry_offset);
+ data = *src64;
+ break;
+ default:
+ return (-1);
+ }
+
+ return (data);
+}
+
+static void
+msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
+ uint64_t offset, int size, uint64_t data)
+{
+ struct pci_devinst *pi;
+ struct msix_table_entry *entry;
+ uint32_t *dest;
+ size_t entry_offset;
+ uint32_t vector_control;
+ int error, index;
+
+ pi = sc->psc_pi;
+ entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+ index = offset / MSIX_TABLE_ENTRY_SIZE;
+ entry = &pi->pi_msix.table[index];
+
+ /* Only 4 byte naturally-aligned writes are supported */
+ assert(size == 4);
+ assert(entry_offset % 4 == 0);
+
+ vector_control = entry->vector_control;
+ dest = (uint32_t *)((void *)entry + entry_offset);
+ *dest = data;
+ /* If MSI-X hasn't been enabled, do nothing */
+ if (pi->pi_msix.enabled) {
+ /* If the entry is masked, don't set it up */
+ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
+ (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func,
+ index, entry->msg_data,
+ entry->vector_control,
+ entry->addr);
+ }
+ }
+}
+
+static int
+init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
+{
+ int idx;
+ size_t table_size;
+ vm_paddr_t start;
+ size_t len;
+ struct pci_devinst *pi = sc->psc_pi;
+
+ /*
+ * If the MSI-X table BAR maps memory intended for
+ * other uses, it is at least assured that the table
+ * either resides in its own page within the region,
+ * or it resides in a page shared with only the PBA.
+ */
+ if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar &&
+ ((pi->pi_msix.pba_offset - pi->pi_msix.table_offset) < 4096)) {
+ /* Need to also emulate the PBA, not supported yet */
+ printf("Unsupported MSI-X table and PBA in same page\n");
+ return (-1);
+ }
+
+ /*
+ * May need to split the BAR into 3 regions:
+ * Before the MSI-X table, the MSI-X table, and after it
+ * XXX for now, assume that the table is not in the middle
+ */
+ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
+ pi->pi_msix.table_size = table_size;
+ idx = pi->pi_msix.table_bar;
+
+ /* Round up to page size */
+	table_size = (table_size + 0xFFF) & ~0xFFF;
+ if (pi->pi_msix.table_offset == 0) {
+ /* Map everything after the MSI-X table */
+ start = pi->pi_bar[idx].addr + table_size;
+ len = pi->pi_bar[idx].size - table_size;
+ } else {
+ /* Map everything before the MSI-X table */
+ start = pi->pi_bar[idx].addr;
+ len = pi->pi_msix.table_offset;
+ }
+ return (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ start, len, base + table_size));
+}
+
+static int
+cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
+{
+ int i, error;
+ struct pci_devinst *pi;
+ struct pci_bar_io bar;
+ enum pcibar_type bartype;
+ uint64_t base;
+
+ pi = sc->psc_pi;
+
+ /*
+ * Initialize BAR registers
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ bzero(&bar, sizeof(bar));
+ bar.pbi_sel = sc->psc_sel;
+ bar.pbi_reg = PCIR_BAR(i);
+
+ if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
+ continue;
+
+ if (PCI_BAR_IO(bar.pbi_base)) {
+ bartype = PCIBAR_IO;
+ base = bar.pbi_base & PCIM_BAR_IO_BASE;
+ } else {
+ switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
+ case PCIM_BAR_MEM_64:
+ bartype = PCIBAR_MEM64;
+ break;
+ default:
+ bartype = PCIBAR_MEM32;
+ break;
+ }
+ base = bar.pbi_base & PCIM_BAR_MEM_BASE;
+ }
+
+ /* Cache information about the "real" BAR */
+ sc->psc_bar[i].type = bartype;
+ sc->psc_bar[i].size = bar.pbi_length;
+ sc->psc_bar[i].addr = base;
+
+ /* Allocate the BAR in the guest I/O or MMIO space */
+ error = pci_emul_alloc_pbar(pi, i, base, bartype,
+ bar.pbi_length);
+ if (error)
+ return (-1);
+
+ /* The MSI-X table needs special handling */
+ if (i == pi->pi_msix.table_bar) {
+ error = init_msix_table(ctx, sc, base);
+ if (error)
+ return (-1);
+ } else if (bartype != PCIBAR_IO) {
+ /* Map the physical MMIO space in the guest MMIO space */
+ error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
+ pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
+ if (error)
+ return (-1);
+ }
+
+ /*
+		 * A 64-bit BAR takes up two BAR slots, so skip the next one.
+ */
+ if (bartype == PCIBAR_MEM64) {
+ i++;
+ assert(i <= PCI_BARMAX);
+ sc->psc_bar[i].type = PCIBAR_MEMHI64;
+ }
+ }
+ return (0);
+}
+
+static int
+cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
+{
+ int error;
+ struct passthru_softc *sc;
+
+ error = 1;
+ sc = pi->pi_arg;
+
+ bzero(&sc->psc_sel, sizeof(struct pcisel));
+ sc->psc_sel.pc_bus = bus;
+ sc->psc_sel.pc_dev = slot;
+ sc->psc_sel.pc_func = func;
+
+ if (cfginitmsi(sc) != 0)
+ goto done;
+
+ if (cfginitbar(ctx, sc) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ return (error);
+}
+
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int bus, slot, func, error;
+ struct passthru_softc *sc;
+
+ sc = NULL;
+ error = 1;
+
+ if (pcifd < 0) {
+ pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
+ if (pcifd < 0)
+ goto done;
+ }
+
+ if (iofd < 0) {
+ iofd = open(_PATH_DEVIO, O_RDWR, 0);
+ if (iofd < 0)
+ goto done;
+ }
+
+ if (opts == NULL ||
+ sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
+ goto done;
+
+ if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
+ goto done;
+
+ sc = malloc(sizeof(struct passthru_softc));
+ memset(sc, 0, sizeof(struct passthru_softc));
+
+ pi->pi_arg = sc;
+ sc->psc_pi = pi;
+
+ /* initialize config space */
+ if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
+ goto done;
+
+ error = 0; /* success */
+done:
+ if (error) {
+ free(sc);
+ vm_unassign_pptdev(ctx, bus, slot, func);
+ }
+ return (error);
+}
+
+static int
+bar_access(int coff)
+{
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+ return (1);
+ else
+ return (0);
+}
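+
+/*
+ * With PCI_BARMAX == 5 this covers config offsets 0x10 through 0x27,
+ * i.e. the six BAR registers of a type 0 header.
+ */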
+
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+ int caplen;
+
+ if (sc->psc_msi.capoff == 0)
+ return (0);
+
+ caplen = msi_caplen(sc->psc_msi.msgctrl);
+
+ if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
+ return (1);
+ else
+ return (0);
+}
+
+static int
+msixcap_access(struct passthru_softc *sc, int coff)
+{
+ if (sc->psc_msix.capoff == 0)
+ return (0);
+
+ return (coff >= sc->psc_msix.capoff &&
+ coff < sc->psc_msix.capoff + MSIX_CAPLEN);
+}
+
+static int
+passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t *rv)
+{
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+	 * PCI BARs and the MSI capability are emulated.
+ */
+ if (bar_access(coff) || msicap_access(sc, coff))
+ return (-1);
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * Emulate PCIR_CAP_PTR if this device does not support MSI capability
+ * natively.
+ */
+ if (sc->psc_msi.emulated) {
+ if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
+ return (-1);
+ }
+#endif
+
+ /* Everything else just read from the device's config space */
+ *rv = read_config(&sc->psc_sel, coff, bytes);
+
+ return (0);
+}
+
+static int
+passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t val)
+{
+ int error, msix_table_entries, i;
+ struct passthru_softc *sc;
+
+ sc = pi->pi_arg;
+
+ /*
+ * PCI BARs are emulated
+ */
+ if (bar_access(coff))
+ return (-1);
+
+ /*
+ * MSI capability is emulated
+ */
+ if (msicap_access(sc, coff)) {
+ msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
+
+ error = vm_setup_msi(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.cpu,
+ pi->pi_msi.vector, pi->pi_msi.msgnum);
+ if (error != 0) {
+ printf("vm_setup_msi returned error %d\r\n", errno);
+ exit(1);
+ }
+ return (0);
+ }
+
+ if (msixcap_access(sc, coff)) {
+ msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
+ if (pi->pi_msix.enabled) {
+ msix_table_entries = pi->pi_msix.table_count;
+ for (i = 0; i < msix_table_entries; i++) {
+ error = vm_setup_msix(ctx, vcpu, sc->psc_sel.pc_bus,
+ sc->psc_sel.pc_dev,
+ sc->psc_sel.pc_func, i,
+ pi->pi_msix.table[i].msg_data,
+ pi->pi_msix.table[i].vector_control,
+ pi->pi_msix.table[i].addr);
+
+ if (error) {
+ printf("vm_setup_msix returned error %d\r\n", errno);
+ exit(1);
+ }
+ }
+ }
+ return (0);
+ }
+
+#ifdef LEGACY_SUPPORT
+ /*
+ * If this device does not support MSI natively then we cannot let
+ * the guest disable legacy interrupts from the device. It is the
+ * legacy interrupt that is triggering the virtual MSI to the guest.
+ */
+ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
+ if (coff == PCIR_COMMAND && bytes == 2)
+ val &= ~PCIM_CMD_INTxDIS;
+ }
+#endif
+
+ write_config(&sc->psc_sel, coff, bytes, val);
+
+ return (0);
+}
+
+static void
+passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+
+ sc = pi->pi_arg;
+
+ if (pi->pi_msix.table_bar == baridx) {
+ msix_table_write(ctx, vcpu, sc, offset, size, value);
+ } else {
+ assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_WRITE;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = value;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+ }
+}
+
+static uint64_t
+passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct passthru_softc *sc;
+ struct iodev_pio_req pio;
+ uint64_t val;
+
+ sc = pi->pi_arg;
+
+ if (pi->pi_msix.table_bar == baridx) {
+ val = msix_table_read(sc, offset, size);
+ } else {
+ assert(pi->pi_bar[baridx].type == PCIBAR_IO);
+ bzero(&pio, sizeof(struct iodev_pio_req));
+ pio.access = IODEV_PIO_READ;
+ pio.port = sc->psc_bar[baridx].addr + offset;
+ pio.width = size;
+ pio.val = 0;
+
+ (void)ioctl(iofd, IODEV_PIO, &pio);
+
+ val = pio.val;
+ }
+
+ return (val);
+}
+
+struct pci_devemu passthru = {
+ .pe_emu = "passthru",
+ .pe_init = passthru_init,
+ .pe_cfgwrite = passthru_cfgwrite,
+ .pe_cfgread = passthru_cfgread,
+ .pe_barwrite = passthru_write,
+ .pe_barread = passthru_read,
+};
+PCI_EMUL_SET(passthru);
diff --git a/usr.sbin/bhyve/pci_uart.c b/usr.sbin/bhyve/pci_uart.c
new file mode 100644
index 0000000..dd30551
--- /dev/null
+++ b/usr.sbin/bhyve/pci_uart.c
@@ -0,0 +1,626 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/select.h>
+#include <dev/ic/ns16550.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+
+#define COM1_BASE 0x3F8
+#define COM1_IRQ 4
+#define COM2_BASE 0x2F8
+#define COM2_IRQ 3
+
+#define DEFAULT_RCLK 1843200
+#define DEFAULT_BAUD 9600
+
+#define FCR_RX_MASK 0xC0
+
+#define MCR_OUT1 0x04
+#define MCR_OUT2 0x08
+
+#define MSR_DELTA_MASK 0x0f
+
+#ifndef REG_SCR
+#define REG_SCR com_scr
+#endif
+
+#define FIFOSZ 16
+
+/*
+ * Pick the PCI vid/did of a chip with a single uart at BAR0
+ * that most versions of FreeBSD can understand: the Siig
+ * CyberSerial 1-port.
+ */
+#define COM_VENDOR 0x131f
+#define COM_DEV 0x2000
+
+static int pci_uart_stdio; /* stdio in use for i/o */
+
+static int pci_uart_nldevs; /* number of legacy devices - 2 max */
+
+static struct {
+ uint64_t baddr;
+ int vector;
+} pci_uart_lres[] = {
+ { COM1_BASE, COM1_IRQ},
+ { COM2_BASE, COM2_IRQ},
+ { 0, 0 }
+};
+
+struct fifo {
+ uint8_t buf[FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of characters in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct pci_uart_softc {
+ struct pci_devinst *pi;
+ pthread_mutex_t mtx; /* protects all softc elements */
+ uint8_t data; /* Data register (R/W) */
+ uint8_t ier; /* Interrupt enable register (R/W) */
+ uint8_t lcr; /* Line control register (R/W) */
+ uint8_t mcr; /* Modem control register (R/W) */
+ uint8_t lsr; /* Line status register (R/W) */
+ uint8_t msr; /* Modem status register (R/W) */
+ uint8_t fcr; /* FIFO control register (W) */
+ uint8_t scr; /* Scratch register (R/W) */
+
+ uint8_t dll; /* Baudrate divisor latch LSB */
+ uint8_t dlh; /* Baudrate divisor latch MSB */
+
+ struct fifo rxfifo;
+
+ int opened;
+ int stdio;
+ bool thre_int_pending; /* THRE interrupt pending */
+};
+
+static void pci_uart_drain(int fd, enum ev_type ev, void *arg);
+
+static struct termios tio_orig, tio_new; /* I/O Terminals */
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+	if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDIN_FILENO, &wb, 1);
+}
+
+static void
+fifo_reset(struct fifo *fifo, int size)
+{
+ bzero(fifo, sizeof(struct fifo));
+ fifo->size = size;
+}
+
+static int
+fifo_putchar(struct fifo *fifo, uint8_t ch)
+{
+
+ if (fifo->num < fifo->size) {
+ fifo->buf[fifo->windex] = ch;
+ fifo->windex = (fifo->windex + 1) % fifo->size;
+ fifo->num++;
+ return (0);
+ } else
+ return (-1);
+}
+
+static int
+fifo_getchar(struct fifo *fifo)
+{
+ int c;
+
+ if (fifo->num > 0) {
+ c = fifo->buf[fifo->rindex];
+ fifo->rindex = (fifo->rindex + 1) % fifo->size;
+ fifo->num--;
+ return (c);
+ } else
+ return (-1);
+}
+
+static int
+fifo_numchars(struct fifo *fifo)
+{
+
+ return (fifo->num);
+}
+
+static int
+fifo_available(struct fifo *fifo)
+{
+
+ return (fifo->num < fifo->size);
+}
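+
+/*
+ * The fifo is a ring buffer: e.g. after three putchars into an empty
+ * 16-entry fifo, rindex == 0, windex == 3 and num == 3; a getchar
+ * then advances rindex to 1 and drops num to 2.
+ */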
+
+static void
+pci_uart_opentty(struct pci_uart_softc *sc)
+{
+ struct mevent *mev;
+
+ assert(sc->opened == 0);
+ assert(sc->stdio);
+
+ ttyopen();
+ mev = mevent_add(STDIN_FILENO, EVF_READ, pci_uart_drain, sc);
+ assert(mev);
+}
+
+static void
+pci_uart_legacy_res(uint64_t *bar, int *ivec)
+{
+ if (pci_uart_lres[pci_uart_nldevs].baddr != 0) {
+ *bar = pci_uart_lres[pci_uart_nldevs].baddr;
+ *ivec = pci_uart_lres[pci_uart_nldevs].vector;
+ pci_uart_nldevs++;
+ } else {
+ /* TODO: print warning ? */
+ *bar = 0;
+		*ivec = -1;
+ }
+}
+
+/*
+ * The IIR returns a prioritized interrupt reason:
+ * - receive data available
+ * - transmit holding register empty
+ * - modem status change
+ *
+ * Return an interrupt reason if one is available.
+ */
+static int
+pci_uart_intr_reason(struct pci_uart_softc *sc)
+{
+
+ if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
+ return (IIR_RLS);
+ else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0)
+ return (IIR_RXTOUT);
+ else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
+ return (IIR_TXRDY);
+ else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
+ return (IIR_MLSC);
+ else
+ return (IIR_NOPEND);
+}
+
+static void
+pci_uart_reset(struct pci_uart_softc *sc)
+{
+ uint16_t divisor;
+
+ divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
+ sc->dll = divisor;
+	sc->dlh = divisor >> 8;
+
+ fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */
+}
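+
+/*
+ * With the defaults above, divisor = 1843200 / 9600 / 16 = 12, so
+ * the reset state of the divisor latch is dll = 12, dlh = 0.
+ */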
+
+/*
+ * Toggle the COM port's intr pin depending on whether or not we have an
+ * interrupt condition to report to the processor.
+ */
+static void
+pci_uart_toggle_intr(struct pci_uart_softc *sc)
+{
+ uint8_t intr_reason;
+
+ intr_reason = pci_uart_intr_reason(sc);
+
+ if (intr_reason == IIR_NOPEND)
+ pci_lintr_deassert(sc->pi);
+ else
+ pci_lintr_assert(sc->pi);
+}
+
+static void
+pci_uart_drain(int fd, enum ev_type ev, void *arg)
+{
+ struct pci_uart_softc *sc;
+ int ch;
+
+ sc = arg;
+
+ assert(fd == STDIN_FILENO);
+ assert(ev == EVF_READ);
+
+ /*
+ * This routine is called in the context of the mevent thread
+ * to take out the softc lock to protect against concurrent
+ * access from a vCPU i/o exit
+ */
+ pthread_mutex_lock(&sc->mtx);
+
+ if ((sc->mcr & MCR_LOOPBACK) != 0) {
+ (void) ttyread();
+ } else {
+ while (fifo_available(&sc->rxfifo) &&
+ ((ch = ttyread()) != -1)) {
+ fifo_putchar(&sc->rxfifo, ch);
+ }
+ pci_uart_toggle_intr(sc);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+static void
+pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_uart_softc *sc;
+ int fifosz;
+ uint8_t msr;
+
+ sc = pi->pi_arg;
+
+ assert(baridx == 0);
+ assert(size == 1);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ pci_uart_opentty(sc);
+ sc->opened = 1;
+ }
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ sc->dll = value;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ sc->dlh = value;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ if (sc->mcr & MCR_LOOPBACK) {
+ if (fifo_putchar(&sc->rxfifo, value) != 0)
+ sc->lsr |= LSR_OE;
+ } else if (sc->stdio) {
+ ttywrite(value);
+ } /* else drop on floor */
+ sc->thre_int_pending = true;
+ break;
+ case REG_IER:
+ /*
+		 * Only bits 0-3 of the IER are writable; mask off
+		 * bits 4-7.
+ */
+ sc->ier = value & 0x0F;
+ break;
+ case REG_FCR:
+ /*
+		 * When switching between FIFO and 16450 mode,
+ * the FIFO contents are reset.
+ */
+ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
+ fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
+ fifo_reset(&sc->rxfifo, fifosz);
+ }
+
+ /*
+ * The FCR_ENABLE bit must be '1' for the programming
+ * of other FCR bits to be effective.
+ */
+ if ((value & FCR_ENABLE) == 0) {
+ sc->fcr = 0;
+ } else {
+ if ((value & FCR_RCV_RST) != 0)
+ fifo_reset(&sc->rxfifo, FIFOSZ);
+
+ sc->fcr = value &
+ (FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
+ }
+ break;
+ case REG_LCR:
+ sc->lcr = value;
+ break;
+ case REG_MCR:
+ /* Apply mask so that bits 5-7 are 0 */
+ sc->mcr = value & 0x1F;
+
+ msr = 0;
+ if (sc->mcr & MCR_LOOPBACK) {
+ /*
+ * In the loopback mode certain bits from the
+ * MCR are reflected back into MSR
+ */
+ if (sc->mcr & MCR_RTS)
+ msr |= MSR_CTS;
+ if (sc->mcr & MCR_DTR)
+ msr |= MSR_DSR;
+ if (sc->mcr & MCR_OUT1)
+ msr |= MSR_RI;
+ if (sc->mcr & MCR_OUT2)
+ msr |= MSR_DCD;
+ }
+
+ /*
+ * Detect if there has been any change between the
+ * previous and the new value of MSR. If there is
+ * then assert the appropriate MSR delta bit.
+ */
+ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
+ sc->msr |= MSR_DCTS;
+ if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
+ sc->msr |= MSR_DDSR;
+ if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
+ sc->msr |= MSR_DDCD;
+ if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
+ sc->msr |= MSR_TERI;
+
+ /*
+ * Update the value of MSR while retaining the delta
+ * bits.
+ */
+ sc->msr &= MSR_DELTA_MASK;
+ sc->msr |= msr;
+ break;
+ case REG_LSR:
+ /*
+ * Line status register is not meant to be written to
+ * during normal operation.
+ */
+ break;
+ case REG_MSR:
+ /*
+		 * The MSR is a read-only register; ignore writes to it.
+ */
+ break;
+ case REG_SCR:
+ sc->scr = value;
+ break;
+ default:
+ break;
+ }
+
+done:
+ pci_uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+uint64_t
+pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_uart_softc *sc;
+ uint8_t iir, intr_reason;
+ uint64_t reg;
+
+ sc = pi->pi_arg;
+
+ assert(baridx == 0);
+ assert(size == 1);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ pci_uart_opentty(sc);
+ sc->opened = 1;
+ }
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ reg = sc->dll;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ reg = sc->dlh;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ reg = fifo_getchar(&sc->rxfifo);
+ break;
+ case REG_IER:
+ reg = sc->ier;
+ break;
+ case REG_IIR:
+ iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
+
+ intr_reason = pci_uart_intr_reason(sc);
+
+ /*
+ * Deal with side effects of reading the IIR register
+ */
+ if (intr_reason == IIR_TXRDY)
+ sc->thre_int_pending = false;
+
+ iir |= intr_reason;
+
+ reg = iir;
+ break;
+ case REG_LCR:
+ reg = sc->lcr;
+ break;
+ case REG_MCR:
+ reg = sc->mcr;
+ break;
+ case REG_LSR:
+ /* Transmitter is always ready for more data */
+ sc->lsr |= LSR_TEMT | LSR_THRE;
+
+ /* Check for new receive data */
+ if (fifo_numchars(&sc->rxfifo) > 0)
+ sc->lsr |= LSR_RXRDY;
+ else
+ sc->lsr &= ~LSR_RXRDY;
+
+ reg = sc->lsr;
+
+ /* The LSR_OE bit is cleared on LSR read */
+ sc->lsr &= ~LSR_OE;
+ break;
+ case REG_MSR:
+ /*
+ * MSR delta bits are cleared on read
+ */
+ reg = sc->msr;
+ sc->msr &= ~MSR_DELTA_MASK;
+ break;
+ case REG_SCR:
+ reg = sc->scr;
+ break;
+ default:
+ reg = 0xFF;
+ break;
+ }
+
+done:
+ pci_uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (reg);
+}
+
+static int
+pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct pci_uart_softc *sc;
+ uint64_t bar;
+ int ivec;
+
+ sc = malloc(sizeof(struct pci_uart_softc));
+ memset(sc, 0, sizeof(struct pci_uart_softc));
+
+ pi->pi_arg = sc;
+ sc->pi = pi;
+
+ pthread_mutex_init(&sc->mtx, NULL);
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+ if (pci_is_legacy(pi)) {
+ pci_uart_legacy_res(&bar, &ivec);
+ pci_emul_alloc_pbar(pi, 0, bar, PCIBAR_IO, 8);
+ } else {
+ ivec = -1;
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, 8);
+ }
+ pci_lintr_request(pi, ivec);
+
+ if (opts != NULL && !strcmp("stdio", opts) && !pci_uart_stdio) {
+ pci_uart_stdio = 1;
+ sc->stdio = 1;
+ }
+
+ pci_uart_reset(sc);
+
+ return (0);
+}
+
+struct pci_devemu pci_de_com = {
+ .pe_emu = "uart",
+ .pe_init = pci_uart_init,
+ .pe_barwrite = pci_uart_write,
+ .pe_barread = pci_uart_read
+};
+PCI_EMUL_SET(pci_de_com);
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000..3382097
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -0,0 +1,534 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTBLK_RINGSZ 64
+
+#define VTBLK_CFGSZ 28
+
+#define VTBLK_R_CFG VTCFG_R_CFG0
+#define VTBLK_R_CFG_END		(VTBLK_R_CFG + VTBLK_CFGSZ - 1)
+#define VTBLK_R_MAX VTBLK_R_CFG_END
+
+#define VTBLK_REGSZ		(VTBLK_R_MAX + 1)
+
+#define VTBLK_MAXSEGS 32
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( 0x00000004 | /* host maximum request segments */ \
+ 0x10000000 ) /* supports indirect descriptors */
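+
+/*
+ * These correspond to the standard virtio feature bits
+ * VIRTIO_BLK_F_SEG_MAX (bit 2) and VIRTIO_RING_F_INDIRECT_DESC
+ * (bit 28).
+ */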
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Config space
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ uint16_t vbc_geom_c;
+ uint8_t vbc_geom_h;
+ uint8_t vbc_geom_s;
+ uint32_t vbc_blk_size;
+ uint32_t vbc_sectors_max;
+} __packed;
+CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+ struct pci_devinst *vbsc_pi;
+ int vbsc_fd;
+ int vbsc_status;
+ int vbsc_isr;
+ int vbsc_lastq;
+ uint32_t vbsc_features;
+ uint64_t vbsc_pfn;
+ struct vring_hqueue vbsc_q;
+ struct vtblk_config vbsc_cfg;
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
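+
+/*
+ * Wraparound example: with hq_cur_aidx == 65534 and *hq_avail_idx
+ * == 2, ndesc = 65535 - 65534 + 2 + 1 = 4, covering avail indices
+ * 65534, 65535, 0 and 1.
+ */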
+
+static void
+pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
+{
+ if (value == 0) {
+		DPRINTF(("vtblk: device reset requested!\n"));
+ }
+
+ sc->vbsc_status = value;
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTBLK_MAXSEGS];
+ struct virtio_blk_hdr *vbh;
+ struct virtio_desc *vd, *vid;
+ struct virtio_used *vu;
+ uint8_t *status;
+ int i;
+ int err;
+ int iolen;
+ int nsegs;
+ int uidx, aidx, didx;
+ int writeop;
+ off_t offset;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Verify that the descriptor is indirect, and obtain
+ * the pointer to the indirect descriptor.
+ * There has to be space for at least 3 descriptors
+ * in the indirect descriptor array: the block header,
+ * 1 or more data descriptors, and a status byte.
+ */
+ assert(vd->vd_flags & VRING_DESC_F_INDIRECT);
+
+ nsegs = vd->vd_len / sizeof(struct virtio_desc);
+ assert(nsegs >= 3);
+ assert(nsegs < VTBLK_MAXSEGS + 2);
+
+ vid = paddr_guest2host(vd->vd_addr);
+ assert((vid->vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * The first descriptor will be the read-only fixed header
+ */
+ vbh = paddr_guest2host(vid[0].vd_addr);
+ assert(vid[0].vd_len == sizeof(struct virtio_blk_hdr));
+ assert(vid[0].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[0].vd_flags & VRING_DESC_F_WRITE) == 0);
+
+ writeop = (vbh->vbh_type == VBH_OP_WRITE);
+
+ offset = vbh->vbh_sector * DEV_BSIZE;
+
+ /*
+ * Build up the iovec based on the guest's data descriptors
+ */
+ for (i = 1, iolen = 0; i < nsegs - 1; i++) {
+ iov[i-1].iov_base = paddr_guest2host(vid[i].vd_addr);
+ iov[i-1].iov_len = vid[i].vd_len;
+ iolen += vid[i].vd_len;
+
+ assert(vid[i].vd_flags & VRING_DESC_F_NEXT);
+ assert((vid[i].vd_flags & VRING_DESC_F_INDIRECT) == 0);
+
+ /*
+ * - write op implies read-only descriptor,
+ * - read op implies write-only descriptor,
+ * therefore test the inverse of the descriptor bit
+ * to the op.
+ */
+ assert(((vid[i].vd_flags & VRING_DESC_F_WRITE) == 0) ==
+ writeop);
+ }
+
+ /* Lastly, get the address of the status byte */
+ status = paddr_guest2host(vid[nsegs - 1].vd_addr);
+ assert(vid[nsegs - 1].vd_len == 1);
+ assert((vid[nsegs - 1].vd_flags & VRING_DESC_F_NEXT) == 0);
+ assert(vid[nsegs - 1].vd_flags & VRING_DESC_F_WRITE);
+
+ DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
+ writeop ? "write" : "read", iolen, nsegs - 2, offset));
+
+ if (writeop){
+ err = pwritev(sc->vbsc_fd, iov, nsegs - 2, offset);
+ } else {
+ err = preadv(sc->vbsc_fd, iov, nsegs - 2, offset);
+ }
+
+ *status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
+
+ /*
+ * Return the single indirect descriptor back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = 1;
+ hq->hq_cur_aidx++;
+ *hq->hq_used_idx += 1;
+}
+
+static void
+pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vbsc_q;
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+	 * Run through all the queued entries, processing each
+	 * block request in turn.
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtblk_proc(sc, hq);
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0 &&
+ sc->vbsc_isr == 0) {
+ sc->vbsc_isr = 1;
+ pci_generate_msi(sc->vbsc_pi, 0);
+ }
+
+}
+
+static void
+pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+
+ sc->vbsc_pfn = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
+ hq = &sc->vbsc_q;
+ hq->hq_size = VTBLK_RINGSZ;
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
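+
+/*
+ * Assuming a 16-byte struct virtio_desc, the resulting layout for
+ * VTBLK_RINGSZ == 64 is: a 1024-byte descriptor table at the start
+ * of the guest page, followed by the avail ring (flags, idx and 64
+ * ring entries), with the used ring starting at the next VRING_ALIGN
+ * boundary.
+ */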
+
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct stat sbuf;
+ struct pci_vtblk_softc *sc;
+ off_t size;
+ int fd;
+ int sectsz;
+
+ if (opts == NULL) {
+ printf("virtio-block: backing device required\n");
+ return (1);
+ }
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ /*
+ * The supplied backing file has to exist
+ */
+ fd = open(opts, O_RDWR);
+ if (fd < 0) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ perror("Could not stat backing file");
+ close(fd);
+ return (1);
+ }
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ close(fd);
+ return (1);
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ }
+
+ sc = malloc(sizeof(struct pci_vtblk_softc));
+ memset(sc, 0, sizeof(struct pci_vtblk_softc));
+
+ pi->pi_arg = sc;
+ sc->vbsc_pi = pi;
+ sc->vbsc_fd = fd;
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg.vbc_capacity = size / sectsz;
+ sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
+ sc->vbsc_cfg.vbc_blk_size = sectsz;
+ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
+ sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
+ sc->vbsc_cfg.vbc_geom_h = 0;
+ sc->vbsc_cfg.vbc_geom_s = 0;
+ sc->vbsc_cfg.vbc_sectors_max = 0;
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ);
+
+ return (0);
+}
+
+static void
+pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtblk_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ sc->vbsc_lastq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value == 0);
+ pci_vtblk_qnotify(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtblk_update_status(sc, value);
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+}
+
+uint64_t
+pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_vtblk_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint32_t value;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTBLK_REGSZ) {
+ DPRINTF(("vtblk_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTBLK_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vbsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vbsc_pfn >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+		value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ : 0;
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vbsc_lastq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = 0; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vbsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vbsc_isr;
+ sc->vbsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTBLK_R_CFG ... VTBLK_R_CFG_END:
+ assert(size + offset <= (VTBLK_R_CFG_END + 1));
+ ptr = (uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG;
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
+ break;
+ default:
+ DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vblk = {
+ .pe_emu = "virtio-blk",
+ .pe_init = pci_vtblk_init,
+ .pe_barwrite = pci_vtblk_write,
+ .pe_barread = pci_vtblk_read
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000..3f6f88a
--- /dev/null
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -0,0 +1,781 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+#include "virtio.h"
+
+#define VTNET_RINGSZ 256
+
+#define VTNET_MAXSEGS 32
+
+/*
+ * PCI config-space register offsets
+ */
+#define VTNET_R_CFG0 20
+#define VTNET_R_CFG1 21
+#define VTNET_R_CFG2 22
+#define VTNET_R_CFG3 23
+#define VTNET_R_CFG4 24
+#define VTNET_R_CFG5 25
+#define VTNET_R_CFG6 26
+#define VTNET_R_CFG7 27
+#define VTNET_R_MAX 27
+
+#define VTNET_REGSZ		(VTNET_R_MAX + 1)
+
+/*
+ * Host capabilities
+ */
+#define VTNET_S_HOSTCAPS \
+ ( 0x00000020 | /* host supplies MAC */ \
+ 0x00008000 | /* host can merge Rx buffers */ \
+ 0x00010000 ) /* config status available */
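+
+/*
+ * These correspond to VIRTIO_NET_F_MAC (bit 5), VIRTIO_NET_F_MRG_RXBUF
+ * (bit 15) and VIRTIO_NET_F_STATUS (bit 16).
+ */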
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ 0
+#define VTNET_TXQ 1
+#define VTNET_CTLQ 2
+
+#define VTNET_MAXQ 3
+
+struct vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ struct virtio_desc *hq_dtable;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+};
+
+/*
+ * Fixed-size receive header prepended to each packet. The trailing
+ * 'number of buffers' field is present because mergeable rx buffers
+ * are offered in the host capabilities.
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+ struct pci_devinst *vsc_pi;
+ pthread_mutex_t vsc_mtx;
+ struct mevent *vsc_mevp;
+
+ int vsc_curq;
+ int vsc_status;
+ int vsc_isr;
+ int vsc_tapfd;
+ int vsc_rx_ready;
+ int vsc_rxpend;
+
+ uint32_t vsc_features;
+ uint8_t vsc_macaddr[6];
+
+ uint64_t vsc_pfn[VTNET_MAXQ];
+ struct vring_hqueue vsc_hq[VTNET_MAXQ];
+};
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static int
+hq_num_avail(struct vring_hqueue *hq)
+{
+ int ndesc;
+
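+	/*
+	 * e.g. if the guest's avail_idx has wrapped around to 2 while
+	 * hq_cur_aidx is still 65534, the else branch below yields
+	 * 65535 - 65534 + 2 + 1 = 4 available descriptors.
+	 */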
+ if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
+ ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
+ else
+ ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
+
+ assert(ndesc >= 0 && ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
+
+static uint16_t
+pci_vtnet_qsize(int qnum)
+{
+ /* XXX no ctl queue currently */
+ if (qnum == VTNET_CTLQ) {
+ return (0);
+ }
+
+ /* XXX fixed currently. Maybe different for tx/rx/ctl */
+ return (VTNET_RINGSZ);
+}
+
+static void
+pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
+{
+ struct vring_hqueue *hq;
+
+ assert(ring < VTNET_MAXQ);
+
+ hq = &sc->vsc_hq[ring];
+
+ /*
+ * Reset all soft state
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static void
+pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
+{
+
+ if (value == 0) {
+		DPRINTF(("vtnet: device reset requested!\n\r"));
+ pci_vtnet_ring_reset(sc, VTNET_RXQ);
+ pci_vtnet_ring_reset(sc, VTNET_TXQ);
+ sc->vsc_rx_ready = 0;
+ }
+
+ sc->vsc_status = value;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+ int len)
+{
+ char pad[60];
+
+ if (sc->vsc_tapfd == -1)
+ return;
+
+	/*
+	 * If the length is < 60 (the minimum Ethernet frame size,
+	 * not counting the FCS), pad out to that and add the extra
+	 * zero'd segment to the iov. It is guaranteed that there is
+	 * always an extra iov available by the caller.
+	 */
+ if (len < 60) {
+ memset(pad, 0, 60 - len);
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = 60 - len;
+ iovcnt++;
+ }
+ (void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+
+/*
+ * Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ struct vring_hqueue *hq;
+ struct virtio_net_rxhdr *vrx;
+ uint8_t *buf;
+ int i;
+ int len;
+ int ndescs;
+ int didx, uidx, aidx; /* descriptor, avail and used index */
+
+ /*
+ * Should never be called without a valid tap fd
+ */
+ assert(sc->vsc_tapfd != -1);
+
+	/*
+	 * But it may be called before the rx ring has
+	 * been set up.
+	 */
+ if (sc->vsc_rx_ready == 0) {
+ /*
+ * Drop the packet and try later.
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ /*
+ * Calculate the number of available rx buffers
+ */
+ hq = &sc->vsc_hq[VTNET_RXQ];
+
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0) {
+ /*
+ * Need to wait for host notification to read
+ */
+ if (sc->vsc_rxpend == 0) {
+ WPRINTF(("vtnet: no rx descriptors !\n"));
+ sc->vsc_rxpend = 1;
+ }
+
+ /*
+ * Drop the packet and try later
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+ return;
+ }
+
+ aidx = hq->hq_cur_aidx;
+ uidx = *hq->hq_used_idx;
+ for (i = 0; i < ndescs; i++) {
+ /*
+		 * 'aidx' indexes into an array of descriptor indexes
+ */
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
+ buf = (uint8_t *)(vrx + 1);
+
+ len = read(sc->vsc_tapfd, buf,
+ vd->vd_len - sizeof(struct virtio_net_rxhdr));
+
+ if (len < 0 && errno == EWOULDBLOCK) {
+ break;
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers, which is always 1 without TSO
+ * support.
+ */
+ memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+ vrx->vrh_bufs = 1;
+
+ /*
+ * Write this descriptor into the used ring
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
+ uidx++;
+ aidx++;
+ }
+
+ /*
+ * Update the used pointer, and signal an interrupt if allowed
+ */
+ *hq->hq_used_idx = uidx;
+ hq->hq_cur_aidx = aidx;
+
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ pci_vtnet_tap_rx(sc);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+static void
+pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+{
+ /*
+ * A qnotify means that the rx process can now begin
+ */
+ if (sc->vsc_rx_ready == 0) {
+ sc->vsc_rx_ready = 1;
+ }
+
+ /*
+ * If the rx queue was empty, attempt to receive a
+ * packet that was previously blocked due to no rx bufs
+ * available
+ */
+ if (sc->vsc_rxpend) {
+ WPRINTF(("vtnet: rx resumed\n\r"));
+ sc->vsc_rxpend = 0;
+ pci_vtnet_tap_rx(sc);
+ }
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+{
+ struct iovec iov[VTNET_MAXSEGS + 1];
+ struct virtio_desc *vd;
+ struct virtio_used *vu;
+ int i;
+ int plen;
+ int tlen;
+ int uidx, aidx, didx;
+
+ uidx = *hq->hq_used_idx;
+ aidx = hq->hq_cur_aidx;
+ didx = hq->hq_avail_ring[aidx % hq->hq_size];
+ assert(didx >= 0 && didx < hq->hq_size);
+
+ vd = &hq->hq_dtable[didx];
+
+ /*
+ * Run through the chain of descriptors, ignoring the
+ * first header descriptor. However, include the header
+ * length in the total length that will be put into the
+ * used queue.
+ */
+ tlen = vd->vd_len;
+ vd = &hq->hq_dtable[vd->vd_next];
+
+ for (i = 0, plen = 0;
+ i < VTNET_MAXSEGS;
+ i++, vd = &hq->hq_dtable[vd->vd_next]) {
+ iov[i].iov_base = paddr_guest2host(vd->vd_addr);
+ iov[i].iov_len = vd->vd_len;
+ plen += vd->vd_len;
+ tlen += vd->vd_len;
+
+ if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ }
+ assert(i < VTNET_MAXSEGS);
+
+ DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
+ pci_vtnet_tap_tx(sc, iov, i + 1, plen);
+
+ /*
+ * Return this chain back to the host
+ */
+ vu = &hq->hq_used_ring[uidx % hq->hq_size];
+ vu->vu_idx = didx;
+ vu->vu_tlen = tlen;
+ hq->hq_cur_aidx = aidx + 1;
+ *hq->hq_used_idx = uidx + 1;
+
+ /*
+ * Generate an interrupt if able
+ */
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ sc->vsc_isr |= 1;
+ pci_generate_msi(sc->vsc_pi, 0);
+ }
+}
+
+static void
+pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
+{
+ struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
+ int i;
+ int ndescs;
+
+ /*
+ * Calculate number of ring entries to process
+ */
+ ndescs = hq_num_avail(hq);
+
+ if (ndescs == 0)
+ return;
+
+ /*
+ * Run through all the entries, placing them into iovecs and
+ * sending when an end-of-packet is found
+ */
+ for (i = 0; i < ndescs; i++)
+ pci_vtnet_proctx(sc, hq);
+}
+
+static void
+pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
+{
+
+ DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+
+static void
+pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
+{
+ struct vring_hqueue *hq;
+ int qnum = sc->vsc_curq;
+
+ assert(qnum < VTNET_MAXQ);
+
+ sc->vsc_pfn[qnum] = pfn << VRING_PFN;
+
+ /*
+ * Set up host pointers to the various parts of the
+ * queue
+ */
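+	/*
+	 * Legacy vring layout: the descriptor table comes first, followed
+	 * by the avail ring (flags, idx, ring[]), with the used ring
+	 * starting at the next VRING_ALIGN boundary. roundup2() below
+	 * aligns the start of the avail ring rather than its end; with
+	 * the fixed 256-entry rings used here both land on the same
+	 * boundary.
+	 */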
+ hq = &sc->vsc_hq[qnum];
+ hq->hq_size = pci_vtnet_qsize(qnum);
+
+ hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
+ hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+ hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
+ VRING_ALIGN);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+}
+
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ MD5_CTX mdctx;
+ unsigned char digest[16];
+ char nstr[80];
+ struct pci_vtnet_softc *sc;
+
+ /*
+ * Access to guest memory is required. Fail if
+ * memory not mapped
+ */
+ if (paddr_guest2host(0) == NULL)
+ return (1);
+
+ sc = malloc(sizeof(struct pci_vtnet_softc));
+ memset(sc, 0, sizeof(struct pci_vtnet_softc));
+
+ pi->pi_arg = sc;
+ sc->vsc_pi = pi;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /*
+ * Attempt to open the tap device
+ */
+ sc->vsc_tapfd = -1;
+ if (opts != NULL) {
+ char tbuf[80];
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, opts, sizeof(tbuf));
+
+ sc->vsc_tapfd = open(tbuf, O_RDWR);
+ if (sc->vsc_tapfd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ } else {
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ int opt = 1;
+ if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+				WPRINTF(("tap device FIONBIO failed\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+
+ sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+ EVF_READ,
+ pci_vtnet_tap_callback,
+ sc);
+ if (sc->vsc_mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+ }
+ }
+
+ /*
+ * The MAC address is the standard NetApp OUI of 00-a0-98,
+ * followed by an MD5 of the vm name. The slot/func number is
+ * prepended to this for slots other than 1:0, so that
+ * a bootloader can netboot from the equivalent of slot 1.
+ */
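+	/*
+	 * e.g. a vm named "vm1" in slot 1:0 hashes the string "vm1",
+	 * while the same vm in slot 2:0 hashes "2-0-vm1", so the two
+	 * devices end up with different MAC addresses.
+	 */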
+ if (pi->pi_slot == 1 && pi->pi_func == 0) {
+		strlcpy(nstr, vmname, sizeof(nstr));
+ } else {
+ snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+ pi->pi_func, vmname);
+ }
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, nstr, strlen(nstr));
+ MD5Final(digest, &mdctx);
+
+ sc->vsc_macaddr[0] = 0x00;
+ sc->vsc_macaddr[1] = 0xa0;
+ sc->vsc_macaddr[2] = 0x98;
+ sc->vsc_macaddr[3] = digest[0];
+ sc->vsc_macaddr[4] = digest[1];
+ sc->vsc_macaddr[5] = digest[2];
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
+
+ return (0);
+}
+
+/*
+ * Function pointer array to handle queue notifications
+ */
+static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
+ pci_vtnet_ping_rxq,
+ pci_vtnet_ping_txq,
+ pci_vtnet_ping_ctlq
+};
+
+static void
+pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ void *ptr;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ sc->vsc_features = value & VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_vtnet_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ sc->vsc_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value < VTNET_MAXQ);
+ (*pci_vtnet_qnotify[value])(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_vtnet_update_status(sc, value);
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ /*
+ * The driver is allowed to change the MAC address
+ */
+ if (size == 1) {
+ *(uint8_t *) ptr = value;
+ } else if (size == 2) {
+ *(uint16_t *) ptr = value;
+ } else {
+ *(uint32_t *) ptr = value;
+ }
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VTNET_R_CFG6:
+ case VTNET_R_CFG7:
+ DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+uint64_t
+pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_vtnet_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint64_t value;
+
+ assert(baridx == 0);
+
+ if (offset + size > VTNET_REGSZ) {
+ DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ value = VTNET_S_HOSTCAPS;
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+ assert(size == 2);
+ value = pci_vtnet_qsize(sc->vsc_curq);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vsc_isr;
+ sc->vsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTNET_R_CFG0:
+ case VTNET_R_CFG1:
+ case VTNET_R_CFG2:
+ case VTNET_R_CFG3:
+ case VTNET_R_CFG4:
+ case VTNET_R_CFG5:
+ assert((size + offset) <= (VTNET_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+ if (size == 1) {
+ value = *(uint8_t *) ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *) ptr;
+ } else {
+ value = *(uint32_t *) ptr;
+ }
+ break;
+ case VTNET_R_CFG6:
+ assert(size != 4);
+ value = 0x01; /* XXX link always up */
+ break;
+ case VTNET_R_CFG7:
+ assert(size == 1);
+ value = 0; /* XXX link status in LSB */
+ break;
+ default:
+ DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+
+ return (value);
+}
+
+struct pci_devemu pci_de_vnet = {
+ .pe_emu = "virtio-net",
+ .pe_init = pci_vtnet_init,
+ .pe_barwrite = pci_vtnet_write,
+ .pe_barread = pci_vtnet_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/pit_8254.c b/usr.sbin/bhyve/pit_8254.c
new file mode 100644
index 0000000..c96596a
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.c
@@ -0,0 +1,198 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/time.h>
+
+#include <machine/clock.h>
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+#include "pit_8254.h"
+
+#define TIMER_SEL_MASK 0xc0
+#define TIMER_RW_MASK 0x30
+#define TIMER_MODE_MASK 0x0f
+#define TIMER_SEL_READBACK 0xc0
+
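+/* Divide 'freq' by 'hz', rounding to nearest: TIMER_DIV(1193182, 100) = 11932 */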
+#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
+
+#define PIT_8254_FREQ 1193182
+static const int nsecs_per_tick = 1000000000 / PIT_8254_FREQ;
+
+struct counter {
+ struct timeval tv; /* uptime when counter was loaded */
+ uint16_t initial; /* initial counter value */
+ uint8_t cr[2];
+ uint8_t ol[2];
+ int crbyte;
+ int olbyte;
+};
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+latch(struct counter *c)
+{
+ struct timeval tv2;
+ uint16_t lval;
+ uint64_t delta_nsecs, delta_ticks;
+
+ /* cannot latch a new value until the old one has been consumed */
+ if (c->olbyte != 0)
+ return;
+
+ if (c->initial == 0 || c->initial == 1) {
+ /*
+ * XXX the program that runs the VM can be stopped and
+ * restarted at any time. This means that state that was
+ * created by the guest is destroyed between invocations
+ * of the program.
+ *
+ * If the counter's initial value is not programmed we
+ * assume a value that would be set to generate 'guest_hz'
+ * interrupts per second.
+ */
+ c->initial = TIMER_DIV(PIT_8254_FREQ, guest_hz);
+ gettimeofday(&c->tv, NULL);
+ }
+
+ (void)gettimeofday(&tv2, NULL);
+ timevalsub(&tv2, &c->tv);
+ delta_nsecs = tv2.tv_sec * 1000000000 + tv2.tv_usec * 1000;
+ delta_ticks = delta_nsecs / nsecs_per_tick;
+
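+	/*
+	 * The 8254 counts down from the initial value and reloads when
+	 * it reaches zero (only the rate generator and square wave modes
+	 * are accepted by the i/o handler), so the current count is the
+	 * initial value minus the elapsed ticks modulo the initial value.
+	 */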
+ lval = c->initial - delta_ticks % c->initial;
+ c->olbyte = 2;
+ c->ol[1] = lval; /* LSB */
+ c->ol[0] = lval >> 8; /* MSB */
+}
+
+static int
+pit_8254_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int sel, rw, mode;
+ uint8_t val;
+ struct counter *c;
+
+ static struct counter counter[3];
+
+ if (bytes != 1)
+ return (-1);
+
+ val = *eax;
+
+ if (port == TIMER_MODE) {
+ assert(in == 0);
+ sel = val & TIMER_SEL_MASK;
+ rw = val & TIMER_RW_MASK;
+ mode = val & TIMER_MODE_MASK;
+
+ if (sel == TIMER_SEL_READBACK)
+ return (-1);
+ if (rw != TIMER_LATCH && rw != TIMER_16BIT)
+ return (-1);
+
+ if (rw != TIMER_LATCH) {
+ /*
+ * Counter mode is not affected when issuing a
+ * latch command.
+ */
+ if (mode != TIMER_RATEGEN && mode != TIMER_SQWAVE)
+ return (-1);
+ }
+
+ c = &counter[sel >> 6];
+ if (rw == TIMER_LATCH)
+ latch(c);
+ else
+ c->olbyte = 0; /* reset latch after reprogramming */
+
+ return (0);
+ }
+
+ /* counter ports */
+ assert(port >= TIMER_CNTR0 && port <= TIMER_CNTR2);
+ c = &counter[port - TIMER_CNTR0];
+
+ if (in) {
+ /*
+ * XXX
+ * The spec says that once the output latch is completely
+ * read it should revert to "following" the counter. We don't
+ * do this because it is hard and any reasonable OS should
+ * always latch the counter before trying to read it.
+ */
+ if (c->olbyte == 0)
+ c->olbyte = 2;
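+		/* ol[1] (the LSB) is returned first, then ol[0] (the MSB) */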
+ *eax = c->ol[--c->olbyte];
+ } else {
+ c->cr[c->crbyte++] = *eax;
+ if (c->crbyte == 2) {
+ c->crbyte = 0;
+ c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
+ if (c->initial == 0)
+ c->initial = 0xffff;
+ gettimeofday(&c->tv, NULL);
+ }
+ }
+
+ return (0);
+}
+
+INOUT_PORT(8254, TIMER_MODE, IOPORT_F_OUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR0, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR1, IOPORT_F_INOUT, pit_8254_handler);
+INOUT_PORT(8254, TIMER_CNTR2, IOPORT_F_INOUT, pit_8254_handler);
diff --git a/usr.sbin/bhyve/pit_8254.h b/usr.sbin/bhyve/pit_8254.h
new file mode 100644
index 0000000..61bd15d
--- /dev/null
+++ b/usr.sbin/bhyve/pit_8254.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PIT_8254_H_
+#define _PIT_8254_H_
+
+/*
+ * Borrowed from amd64/include/timerreg.h because in that file it is
+ * conditionally compiled for #ifdef _KERNEL only.
+ */
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
+
+#endif /* _PIT_8254_H_ */
diff --git a/usr.sbin/bhyve/pmtmr.c b/usr.sbin/bhyve/pmtmr.c
new file mode 100644
index 0000000..78d14eb
--- /dev/null
+++ b/usr.sbin/bhyve/pmtmr.c
@@ -0,0 +1,108 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <machine/cpufunc.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "inout.h"
+
+/*
+ * The ACPI Power Management timer is a free-running 24- or 32-bit
+ * timer with a frequency of 3.579545 MHz.
+ *
+ * This implementation uses the full 32 bits.
+ */
+
+#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
+
+#define PMTMR_FREQ 3579545 /* 3.579545MHz */
+
+static pthread_mutex_t pmtmr_mtx;
+static uint64_t pmtmr_tscf;
+static uint64_t pmtmr_old;
+static uint64_t pmtmr_tsc_old;
+
+static uint32_t
+pmtmr_val(void)
+{
+ uint64_t pmtmr_tsc_new;
+ uint64_t pmtmr_new;
+ static int inited = 0;
+
+ if (!inited) {
+ size_t len;
+ uint32_t tmpf;
+
+ inited = 1;
+ pthread_mutex_init(&pmtmr_mtx, NULL);
+ len = sizeof(tmpf);
+ sysctlbyname("machdep.tsc_freq", &tmpf, &len,
+ NULL, 0);
+ pmtmr_tscf = tmpf;
+ pmtmr_tsc_old = rdtsc();
+ pmtmr_old = pmtmr_tsc_old / pmtmr_tscf * PMTMR_FREQ;
+ return (pmtmr_old);
+ }
+
+ pthread_mutex_lock(&pmtmr_mtx);
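+	/*
+	 * Scale only the TSC delta since the last read, rather than the
+	 * absolute TSC value, so the 64-bit multiply by PMTMR_FREQ stays
+	 * in range, and accumulate the result.
+	 */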
+ pmtmr_tsc_new = rdtsc();
+ pmtmr_new = (pmtmr_tsc_new - pmtmr_tsc_old) * PMTMR_FREQ / pmtmr_tscf +
+ pmtmr_old;
+ pmtmr_old = pmtmr_new;
+ pmtmr_tsc_old = pmtmr_tsc_new;
+ pthread_mutex_unlock(&pmtmr_mtx);
+
+ return (pmtmr_new);
+}
+
+static int
+pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 4)
+ return (-1);
+
+ *eax = pmtmr_val();
+
+ return (0);
+}
+
+INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler);
diff --git a/usr.sbin/bhyve/post.c b/usr.sbin/bhyve/post.c
new file mode 100644
index 0000000..092a551
--- /dev/null
+++ b/usr.sbin/bhyve/post.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+
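+/*
+ * I/O port 0x84 is commonly used by BIOSes as a POST-code/delay port;
+ * reads simply return 0xff here.
+ */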
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 1)
+ return (-1);
+
+ *eax = 0xff; /* return some garbage */
+ return (0);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
diff --git a/usr.sbin/bhyve/rtc.c b/usr.sbin/bhyve/rtc.c
new file mode 100644
index 0000000..f8b894e
--- /dev/null
+++ b/usr.sbin/bhyve/rtc.c
@@ -0,0 +1,274 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define IO_RTC 0x70
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_MIN 0x02
+#define RTC_HRS 0x04
+#define RTC_WDAY 0x06
+#define RTC_DAY 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32 /* current century */
+
+#define RTC_STATUSA 0xA
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+
+#define RTC_STATUSB 0xB
+#define RTCSB_DST 0x01
+#define RTCSB_24HR 0x02
+#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e
+
+#define RTC_RSTCODE 0x0f
+
+#define RTC_EQUIPMENT 0x14
+
+static int addr;
+
+/* XXX initialize these to default values as they would be from BIOS */
+static uint8_t status_a, status_b, rstcode;
+
+static u_char const bin2bcd_data[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
+};
+#define bin2bcd(bin) (bin2bcd_data[bin])
+
+#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
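+/* e.g. in the default BCD mode, rtcout(59) returns 0x59 */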
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static int
+rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 0);
+
+ if (bytes != 1)
+ return (-1);
+
+ switch (*eax) {
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ case RTC_STATUSA:
+ case RTC_STATUSB:
+ case RTC_INTR:
+ case RTC_STATUSD:
+ case RTC_DIAG:
+ case RTC_RSTCODE:
+ case RTC_EQUIPMENT:
+ break;
+ default:
+ return (-1);
+ }
+
+ addr = *eax;
+ return (0);
+}
+
+static int
+rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int hour;
+ time_t t;
+ struct timeval cur, delta;
+
+ static struct timeval last;
+ static struct tm tm;
+
+ if (bytes != 1)
+ return (-1);
+
+ gettimeofday(&cur, NULL);
+
+ /*
+ * Increment the cached time only once per second so we can guarantee
+ * that the guest has at least one second to read the hour:min:sec
+ * separately and still get a coherent view of the time.
+ */
+ delta = cur;
+ timevalsub(&delta, &last);
+ if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
+ t = cur.tv_sec;
+ localtime_r(&t, &tm);
+ last = cur;
+ }
+
+ if (in) {
+ switch (addr) {
+ case RTC_SEC:
+ *eax = rtcout(tm.tm_sec);
+ return (0);
+ case RTC_MIN:
+ *eax = rtcout(tm.tm_min);
+ return (0);
+ case RTC_HRS:
+ if (status_b & RTCSB_24HR)
+ hour = tm.tm_hour;
+ else
+				hour = ((tm.tm_hour + 11) % 12) + 1; /* 0 -> 12, 13 -> 1 */
+
+ *eax = rtcout(hour);
+
+ /*
+ * If we are representing time in the 12-hour format
+ * then set the MSB to indicate PM.
+ */
+ if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
+ *eax |= 0x80;
+
+ return (0);
+ case RTC_WDAY:
+ *eax = rtcout(tm.tm_wday + 1);
+ return (0);
+ case RTC_DAY:
+ *eax = rtcout(tm.tm_mday);
+ return (0);
+ case RTC_MONTH:
+ *eax = rtcout(tm.tm_mon + 1);
+ return (0);
+ case RTC_YEAR:
+ *eax = rtcout(tm.tm_year % 100);
+ return (0);
+		case RTC_CENTURY:
+			*eax = rtcout((tm.tm_year + 1900) / 100);
+			return (0);
+ case RTC_STATUSA:
+ *eax = status_a;
+ return (0);
+ case RTC_INTR:
+ *eax = 0;
+ return (0);
+ case RTC_STATUSD:
+ *eax = RTCSD_PWR;
+ return (0);
+ case RTC_DIAG:
+ *eax = 0;
+ return (0);
+ case RTC_RSTCODE:
+ *eax = rstcode;
+ return (0);
+ case RTC_EQUIPMENT:
+ *eax = 0;
+ return (0);
+ default:
+ return (-1);
+ }
+ }
+
+ switch (addr) {
+ case RTC_STATUSA:
+ status_a = *eax & ~RTCSA_TUP;
+ break;
+ case RTC_STATUSB:
+ /* XXX not implemented yet XXX */
+ if (*eax & RTCSB_PINTR)
+ return (-1);
+ status_b = *eax;
+ break;
+ case RTC_RSTCODE:
+ rstcode = *eax;
+ break;
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_CENTURY:
+ /*
+ * Ignore writes to the time of day registers
+ */
+ break;
+ default:
+ return (-1);
+ }
+ return (0);
+}
+
+INOUT_PORT(rtc, IO_RTC, IOPORT_F_OUT, rtc_addr_handler);
+INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
diff --git a/usr.sbin/bhyve/spinup_ap.c b/usr.sbin/bhyve/spinup_ap.c
new file mode 100644
index 0000000..2632aed
--- /dev/null
+++ b/usr.sbin/bhyve/spinup_ap.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "spinup_ap.h"
+
+static void
+spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
+{
+ int vector, error;
+ uint16_t cs;
+ uint64_t desc_base;
+ uint32_t desc_limit, desc_access;
+
+ vector = *rip >> PAGE_SHIFT;
+ *rip = 0;
+
+ /*
+ * Update the %cs and %rip of the guest so that it starts
+	 * executing real mode code at 'vector << 12'.
+ */
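+	/*
+	 * e.g. a startup IPI vector of 0x09 results in %cs = 0x0900 and
+	 * %rip = 0, i.e. the AP begins executing at physical address
+	 * 0x9000.
+	 */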
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+ assert(error == 0);
+
+ error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+ &desc_limit, &desc_access);
+ assert(error == 0);
+
+ desc_base = vector << PAGE_SHIFT;
+ error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ assert(error == 0);
+
+ cs = (vector << PAGE_SHIFT) >> 4;
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+ assert(error == 0);
+}
+
+int
+spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
+{
+ int error;
+
+ assert(newcpu != 0);
+ assert(newcpu < guest_ncpus);
+
+ error = vcpu_reset(ctx, newcpu);
+ assert(error == 0);
+
+ /* Set up capabilities */
+ if (fbsdrun_vmexit_on_hlt()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_HALT_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ error = vm_set_capability(ctx, newcpu, VM_CAP_PAUSE_EXIT, 1);
+ assert(error == 0);
+ }
+
+ if (fbsdrun_disable_x2apic())
+ error = vm_set_x2apic_state(ctx, newcpu, X2APIC_DISABLED);
+ else
+ error = vm_set_x2apic_state(ctx, newcpu, X2APIC_ENABLED);
+ assert(error == 0);
+
+ /*
+ * Enable the 'unrestricted guest' mode for 'newcpu'.
+ *
+ * Set up the processor state in power-on 16-bit mode, with the CS:IP
+ * init'd to the specified low-mem 4K page.
+ */
+ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+ assert(error == 0);
+
+ spinup_ap_realmode(ctx, newcpu, &rip);
+
+ fbsdrun_addcpu(ctx, newcpu, rip);
+
+ return (newcpu);
+}
diff --git a/usr.sbin/bhyve/spinup_ap.h b/usr.sbin/bhyve/spinup_ap.h
new file mode 100644
index 0000000..2749ee9
--- /dev/null
+++ b/usr.sbin/bhyve/spinup_ap.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPINUP_AP_H_
+#define _SPINUP_AP_H_
+
+int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
+
+#endif
diff --git a/usr.sbin/bhyve/uart.c b/usr.sbin/bhyve/uart.c
new file mode 100644
index 0000000..640f3bf
--- /dev/null
+++ b/usr.sbin/bhyve/uart.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+
+#include "inout.h"
+
+#define COM1 0x3F8
+#define COM2 0x2F8
+
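+/* Interrupt identification register, at offset 2 from the port base */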
+#define REG_IIR 2
+
+static int
+com_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in);
+
+ if (bytes != 1)
+ return (-1);
+
+ /*
+ * COM port is not implemented so we return 0xFF for all registers
+ */
+ *eax = 0xFF;
+
+ return (0);
+}
+
+INOUT_PORT(uart, COM1 + REG_IIR, IOPORT_F_IN, com_handler);
+INOUT_PORT(uart, COM2 + REG_IIR, IOPORT_F_IN, com_handler);
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
new file mode 100644
index 0000000..474e244
--- /dev/null
+++ b/usr.sbin/bhyve/virtio.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+} __packed;
+
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+} __packed;
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+
+/*
+ * PCI config space constants
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
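+
+/*
+ * These offsets follow the legacy virtio PCI configuration layout.
+ * Device-specific config starts at VTCFG_R_CFG0, or at VTCFG_R_CFG1
+ * when MSI-X is enabled, since the two 16-bit MSI-X vector registers
+ * take up 4 bytes.
+ */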
+
+#endif /* _VIRTIO_H_ */
diff --git a/usr.sbin/bhyve/xmsr.c b/usr.sbin/bhyve/xmsr.c
new file mode 100644
index 0000000..9c05f02
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xmsr.h"
+
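+/*
+ * Catch-all for WRMSR exits that were not emulated elsewhere; any MSR
+ * write that reaches here is unexpected, so report it and give up.
+ */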
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val)
+{
+
+ printf("Unknown WRMSR code %x, val %lx, cpu %d\n", code, val, vcpu);
+ exit(1);
+}
diff --git a/usr.sbin/bhyve/xmsr.h b/usr.sbin/bhyve/xmsr.h
new file mode 100644
index 0000000..8cebcea
--- /dev/null
+++ b/usr.sbin/bhyve/xmsr.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+
+#endif
diff --git a/usr.sbin/bhyvectl/Makefile b/usr.sbin/bhyvectl/Makefile
new file mode 100644
index 0000000..9fde12c
--- /dev/null
+++ b/usr.sbin/bhyvectl/Makefile
@@ -0,0 +1,17 @@
+#
+# $FreeBSD$
+#
+
+PROG= bhyvectl
+SRCS= bhyvectl.c
+
+NO_MAN=
+
+DPADD= ${LIBVMMAPI}
+LDADD= -lvmmapi
+
+WARNS?= 3
+
+CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
new file mode 100644
index 0000000..d5e0503
--- /dev/null
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -0,0 +1,1524 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+ (void)fprintf(stderr,
+ "Usage: %s --vm=<name>\n"
+ " [--cpu=<vcpu_number>]\n"
+ " [--create]\n"
+ " [--destroy]\n"
+ " [--get-all]\n"
+ " [--get-stats]\n"
+ " [--set-desc-ds]\n"
+ " [--get-desc-ds]\n"
+ " [--set-desc-es]\n"
+ " [--get-desc-es]\n"
+ " [--set-desc-gs]\n"
+ " [--get-desc-gs]\n"
+ " [--set-desc-fs]\n"
+ " [--get-desc-fs]\n"
+ " [--set-desc-cs]\n"
+ " [--get-desc-cs]\n"
+ " [--set-desc-ss]\n"
+ " [--get-desc-ss]\n"
+ " [--set-desc-tr]\n"
+ " [--get-desc-tr]\n"
+ " [--set-desc-ldtr]\n"
+ " [--get-desc-ldtr]\n"
+ " [--set-desc-gdtr]\n"
+ " [--get-desc-gdtr]\n"
+ " [--set-desc-idtr]\n"
+ " [--get-desc-idtr]\n"
+ " [--run]\n"
+ " [--capname=<capname>]\n"
+ " [--getcap]\n"
+ " [--setcap=<0|1>]\n"
+ " [--desc-base=<BASE>]\n"
+ " [--desc-limit=<LIMIT>]\n"
+ " [--desc-access=<ACCESS>]\n"
+ " [--set-cr0=<CR0>]\n"
+ " [--get-cr0]\n"
+ " [--set-cr3=<CR3>]\n"
+ " [--get-cr3]\n"
+ " [--set-cr4=<CR4>]\n"
+ " [--get-cr4]\n"
+ " [--set-dr7=<DR7>]\n"
+ " [--get-dr7]\n"
+ " [--set-rsp=<RSP>]\n"
+ " [--get-rsp]\n"
+ " [--set-rip=<RIP>]\n"
+ " [--get-rip]\n"
+ " [--get-rax]\n"
+ " [--set-rax=<RAX>]\n"
+ " [--get-rbx]\n"
+ " [--get-rcx]\n"
+ " [--get-rdx]\n"
+ " [--get-rsi]\n"
+ " [--get-rdi]\n"
+ " [--get-rbp]\n"
+ " [--get-r8]\n"
+ " [--get-r9]\n"
+ " [--get-r10]\n"
+ " [--get-r11]\n"
+ " [--get-r12]\n"
+ " [--get-r13]\n"
+ " [--get-r14]\n"
+ " [--get-r15]\n"
+ " [--set-rflags=<RFLAGS>]\n"
+ " [--get-rflags]\n"
+ " [--set-cs]\n"
+ " [--get-cs]\n"
+ " [--set-ds]\n"
+ " [--get-ds]\n"
+ " [--set-es]\n"
+ " [--get-es]\n"
+ " [--set-fs]\n"
+ " [--get-fs]\n"
+ " [--set-gs]\n"
+ " [--get-gs]\n"
+ " [--set-ss]\n"
+ " [--get-ss]\n"
+ " [--get-tr]\n"
+ " [--get-ldtr]\n"
+ " [--get-vmcs-pinbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls2]\n"
+ " [--get-vmcs-entry-interruption-info]\n"
+ " [--set-vmcs-entry-interruption-info=<info>]\n"
+ " [--get-vmcs-eptp]\n"
+	       "       [--get-vmcs-guest-physical-address]\n"
+	       "       [--get-vmcs-guest-linear-address]\n"
+ " [--set-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-io-bitmap-address]\n"
+ " [--get-vmcs-tsc-offset]\n"
+ " [--get-vmcs-guest-pat]\n"
+ " [--get-vmcs-host-pat]\n"
+ " [--get-vmcs-host-cr0]\n"
+ " [--get-vmcs-host-cr3]\n"
+ " [--get-vmcs-host-cr4]\n"
+ " [--get-vmcs-host-rip]\n"
+ " [--get-vmcs-host-rsp]\n"
+ " [--get-vmcs-cr0-mask]\n"
+ " [--get-vmcs-cr0-shadow]\n"
+ " [--get-vmcs-cr4-mask]\n"
+ " [--get-vmcs-cr4-shadow]\n"
+ " [--get-vmcs-cr3-targets]\n"
+ " [--get-vmcs-apic-access-address]\n"
+ " [--get-vmcs-virtual-apic-address]\n"
+ " [--get-vmcs-tpr-threshold]\n"
+ " [--get-vmcs-msr-bitmap]\n"
+ " [--get-vmcs-msr-bitmap-address]\n"
+ " [--get-vmcs-vpid]\n"
+ " [--get-vmcs-ple-gap]\n"
+ " [--get-vmcs-ple-window]\n"
+ " [--get-vmcs-instruction-error]\n"
+ " [--get-vmcs-exit-ctls]\n"
+ " [--get-vmcs-entry-ctls]\n"
+ " [--get-vmcs-guest-sysenter]\n"
+ " [--get-vmcs-link]\n"
+ " [--get-vmcs-exit-reason]\n"
+ " [--get-vmcs-exit-qualification]\n"
+ " [--get-vmcs-exit-interruption-info]\n"
+ " [--get-vmcs-exit-interruption-error]\n"
+ " [--get-vmcs-interruptibility]\n"
+ " [--set-pinning=<host_cpuid>]\n"
+ " [--get-pinning]\n"
+ " [--set-x2apic-state=<state>]\n"
+ " [--get-x2apic-state]\n"
+ " [--set-lowmem=<memory below 4GB in units of MB>]\n"
+ " [--get-lowmem]\n"
+ " [--set-highmem=<memory above 4GB in units of MB>]\n"
+ " [--get-highmem]\n",
+ progname);
+ exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t lowmem, highmem;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_pinning, get_pinning, pincpu;
+static int set_x2apic_state, get_x2apic_state;
+enum x2apic_state x2apic_state;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+static int get_all;
+
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+ printf("vm exit[%d]\n", vcpu);
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ switch (vmexit->exitcode) {
+ case VM_EXITCODE_INOUT:
+ printf("\treason\t\tINOUT\n");
+ printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
+ printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+ printf("\tflags\t\t%s%s\n",
+ vmexit->u.inout.string ? "STRING " : "",
+ vmexit->u.inout.rep ? "REP " : "");
+ printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
+ printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
+ break;
+ case VM_EXITCODE_VMX:
+ printf("\treason\t\tVMX\n");
+ printf("\terror\t\t%d\n", vmexit->u.vmx.error);
+ printf("\texit_reason\t0x%08x (%u)\n",
+ vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ break;
+ default:
+ printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
+ break;
+ }
+}
+
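+/*
+ * Dump the MSR bitmap pointed to by the VMCS. Per the Intel VT-x layout
+ * assumed here, the 4KB page holds read bitmaps for MSRs 0x00000000-
+ * 0x00001FFF and 0xC0000000-0xC0001FFF at offsets 0 and 1024, followed
+ * by the corresponding write bitmaps at offsets 2048 and 3072. A clear
+ * bit means the access is not intercepted, which is reported as 'R'/'W'
+ * below.
+ */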
+static int
+dump_vmcs_msr_bitmap(int vcpu, u_long addr)
+{
+ int error, fd, byte, bit, readable, writeable;
+ u_int msr;
+ const char *bitmap;
+
+ error = -1;
+ bitmap = MAP_FAILED;
+
+ fd = open("/dev/mem", O_RDONLY, 0);
+ if (fd < 0)
+ goto done;
+
+	bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, addr);
+ if (bitmap == MAP_FAILED)
+ goto done;
+
+ for (msr = 0; msr < 0x2000; msr++) {
+ byte = msr / 8;
+ bit = msr & 0x7;
+
+ /* Look at MSRs in the range 0x00000000 to 0x00001FFF */
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+
+ /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
+ byte += 1024;
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n",
+ 0xc0000000 + msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+ }
+
+ error = 0;
+done:
+ if (bitmap != MAP_FAILED)
+ munmap((void *)bitmap, PAGE_SIZE);
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
+static int
+vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
+{
+
+ return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val));
+}
+
+static int
+vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
+{
+
+ return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val));
+}
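+
+/*
+ * Illustrative use of the wrappers above (a sketch, not called anywhere):
+ *
+ *	uint64_t vpid;
+ *	if (vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid) == 0)
+ *		printf("vpid 0x%04lx\n", vpid);
+ *
+ * VMCS_IDENT() tags the raw VMCS encoding so that the vmm driver can
+ * tell it apart from the VM_REG_* identifiers that vm_get_register()
+ * and vm_set_register() normally take.
+ */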
+
+enum {
+ VMNAME = 1000, /* avoid collision with return values from getopt */
+ VCPU,
+ SET_LOWMEM,
+ SET_HIGHMEM,
+ SET_EFER,
+ SET_CR0,
+ SET_CR3,
+ SET_CR4,
+ SET_DR7,
+ SET_RSP,
+ SET_RIP,
+ SET_RAX,
+ SET_RFLAGS,
+ DESC_BASE,
+ DESC_LIMIT,
+ DESC_ACCESS,
+ SET_CS,
+ SET_DS,
+ SET_ES,
+ SET_FS,
+ SET_GS,
+ SET_SS,
+ SET_TR,
+ SET_LDTR,
+ SET_PINNING,
+ SET_X2APIC_STATE,
+ SET_VMCS_EXCEPTION_BITMAP,
+ SET_VMCS_ENTRY_INTERRUPTION_INFO,
+ SET_CAP,
+ CAPNAME,
+};
+
+int
+main(int argc, char *argv[])
+{
+	char *vmname = NULL;
+ int error, ch, vcpu;
+ vm_paddr_t gpa;
+ size_t len;
+ struct vm_exit vmexit;
+ uint64_t ctl, eptp, bm, addr, u64;
+ struct vmctx *ctx;
+
+ uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
+ uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
+ uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
+ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+
+ struct option opts[] = {
+ { "vm", REQ_ARG, 0, VMNAME },
+ { "cpu", REQ_ARG, 0, VCPU },
+ { "set-lowmem", REQ_ARG, 0, SET_LOWMEM },
+ { "set-highmem",REQ_ARG, 0, SET_HIGHMEM },
+ { "set-efer", REQ_ARG, 0, SET_EFER },
+ { "set-cr0", REQ_ARG, 0, SET_CR0 },
+ { "set-cr3", REQ_ARG, 0, SET_CR3 },
+ { "set-cr4", REQ_ARG, 0, SET_CR4 },
+ { "set-dr7", REQ_ARG, 0, SET_DR7 },
+ { "set-rsp", REQ_ARG, 0, SET_RSP },
+ { "set-rip", REQ_ARG, 0, SET_RIP },
+ { "set-rax", REQ_ARG, 0, SET_RAX },
+ { "set-rflags", REQ_ARG, 0, SET_RFLAGS },
+ { "desc-base", REQ_ARG, 0, DESC_BASE },
+ { "desc-limit", REQ_ARG, 0, DESC_LIMIT },
+ { "desc-access",REQ_ARG, 0, DESC_ACCESS },
+ { "set-cs", REQ_ARG, 0, SET_CS },
+ { "set-ds", REQ_ARG, 0, SET_DS },
+ { "set-es", REQ_ARG, 0, SET_ES },
+ { "set-fs", REQ_ARG, 0, SET_FS },
+ { "set-gs", REQ_ARG, 0, SET_GS },
+ { "set-ss", REQ_ARG, 0, SET_SS },
+ { "set-tr", REQ_ARG, 0, SET_TR },
+ { "set-ldtr", REQ_ARG, 0, SET_LDTR },
+ { "set-pinning",REQ_ARG, 0, SET_PINNING },
+ { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE },
+ { "set-vmcs-exception-bitmap",
+ REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP },
+ { "set-vmcs-entry-interruption-info",
+ REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
+ { "capname", REQ_ARG, 0, CAPNAME },
+ { "setcap", REQ_ARG, 0, SET_CAP },
+ { "getcap", NO_ARG, &getcap, 1 },
+ { "get-stats", NO_ARG, &get_stats, 1 },
+ { "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
+ { "set-desc-ds",NO_ARG, &set_desc_ds, 1 },
+ { "get-desc-es",NO_ARG, &get_desc_es, 1 },
+ { "set-desc-es",NO_ARG, &set_desc_es, 1 },
+ { "get-desc-ss",NO_ARG, &get_desc_ss, 1 },
+ { "set-desc-ss",NO_ARG, &set_desc_ss, 1 },
+ { "get-desc-cs",NO_ARG, &get_desc_cs, 1 },
+ { "set-desc-cs",NO_ARG, &set_desc_cs, 1 },
+ { "get-desc-fs",NO_ARG, &get_desc_fs, 1 },
+ { "set-desc-fs",NO_ARG, &set_desc_fs, 1 },
+ { "get-desc-gs",NO_ARG, &get_desc_gs, 1 },
+ { "set-desc-gs",NO_ARG, &set_desc_gs, 1 },
+ { "get-desc-tr",NO_ARG, &get_desc_tr, 1 },
+ { "set-desc-tr",NO_ARG, &set_desc_tr, 1 },
+ { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 },
+ { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 },
+ { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 },
+ { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 },
+ { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 },
+ { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 },
+ { "get-lowmem", NO_ARG, &get_lowmem, 1 },
+ { "get-highmem",NO_ARG, &get_highmem, 1 },
+ { "get-efer", NO_ARG, &get_efer, 1 },
+ { "get-cr0", NO_ARG, &get_cr0, 1 },
+ { "get-cr3", NO_ARG, &get_cr3, 1 },
+ { "get-cr4", NO_ARG, &get_cr4, 1 },
+ { "get-dr7", NO_ARG, &get_dr7, 1 },
+ { "get-rsp", NO_ARG, &get_rsp, 1 },
+ { "get-rip", NO_ARG, &get_rip, 1 },
+ { "get-rax", NO_ARG, &get_rax, 1 },
+ { "get-rbx", NO_ARG, &get_rbx, 1 },
+ { "get-rcx", NO_ARG, &get_rcx, 1 },
+ { "get-rdx", NO_ARG, &get_rdx, 1 },
+ { "get-rsi", NO_ARG, &get_rsi, 1 },
+ { "get-rdi", NO_ARG, &get_rdi, 1 },
+ { "get-rbp", NO_ARG, &get_rbp, 1 },
+ { "get-r8", NO_ARG, &get_r8, 1 },
+ { "get-r9", NO_ARG, &get_r9, 1 },
+ { "get-r10", NO_ARG, &get_r10, 1 },
+ { "get-r11", NO_ARG, &get_r11, 1 },
+ { "get-r12", NO_ARG, &get_r12, 1 },
+ { "get-r13", NO_ARG, &get_r13, 1 },
+ { "get-r14", NO_ARG, &get_r14, 1 },
+ { "get-r15", NO_ARG, &get_r15, 1 },
+ { "get-rflags", NO_ARG, &get_rflags, 1 },
+ { "get-cs", NO_ARG, &get_cs, 1 },
+ { "get-ds", NO_ARG, &get_ds, 1 },
+ { "get-es", NO_ARG, &get_es, 1 },
+ { "get-fs", NO_ARG, &get_fs, 1 },
+ { "get-gs", NO_ARG, &get_gs, 1 },
+ { "get-ss", NO_ARG, &get_ss, 1 },
+ { "get-tr", NO_ARG, &get_tr, 1 },
+ { "get-ldtr", NO_ARG, &get_ldtr, 1 },
+ { "get-vmcs-pinbased-ctls",
+ NO_ARG, &get_pinbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls",
+ NO_ARG, &get_procbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls2",
+ NO_ARG, &get_procbased_ctls2, 1 },
+ { "get-vmcs-guest-linear-address",
+ NO_ARG, &get_vmcs_gla, 1 },
+ { "get-vmcs-guest-physical-address",
+ NO_ARG, &get_vmcs_gpa, 1 },
+ { "get-vmcs-entry-interruption-info",
+ NO_ARG, &get_vmcs_entry_interruption_info, 1},
+ { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 },
+ { "get-vmcs-exception-bitmap",
+ NO_ARG, &get_exception_bitmap, 1 },
+ { "get-vmcs-io-bitmap-address",
+ NO_ARG, &get_io_bitmap, 1 },
+ { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
+ { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 },
+ { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
+ { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 },
+ { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
+ { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
+ { "get-vmcs-apic-access-address",
+ NO_ARG, &get_apic_access_addr, 1},
+ { "get-vmcs-virtual-apic-address",
+ NO_ARG, &get_virtual_apic_addr, 1},
+ { "get-vmcs-tpr-threshold",
+ NO_ARG, &get_tpr_threshold, 1 },
+ { "get-vmcs-msr-bitmap",
+ NO_ARG, &get_msr_bitmap, 1 },
+ { "get-vmcs-msr-bitmap-address",
+ NO_ARG, &get_msr_bitmap_address, 1 },
+ { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 },
+ { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 },
+ { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
+ { "get-vmcs-instruction-error",
+ NO_ARG, &get_inst_err, 1 },
+ { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 },
+ { "get-vmcs-entry-ctls",
+ NO_ARG, &get_entry_ctls, 1 },
+ { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 },
+ { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 },
+ { "get-vmcs-host-cr0",
+ NO_ARG, &get_host_cr0, 1 },
+ { "get-vmcs-host-cr3",
+ NO_ARG, &get_host_cr3, 1 },
+ { "get-vmcs-host-cr4",
+ NO_ARG, &get_host_cr4, 1 },
+ { "get-vmcs-host-rip",
+ NO_ARG, &get_host_rip, 1 },
+ { "get-vmcs-host-rsp",
+ NO_ARG, &get_host_rsp, 1 },
+ { "get-vmcs-guest-sysenter",
+ NO_ARG, &get_guest_sysenter, 1 },
+ { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 },
+ { "get-vmcs-exit-reason",
+ NO_ARG, &get_vmcs_exit_reason, 1 },
+ { "get-vmcs-exit-qualification",
+ NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-interruption-info",
+ NO_ARG, &get_vmcs_exit_interruption_info, 1},
+ { "get-vmcs-exit-interruption-error",
+ NO_ARG, &get_vmcs_exit_interruption_error, 1},
+ { "get-vmcs-interruptibility",
+ NO_ARG, &get_vmcs_interruptibility, 1 },
+ { "get-pinning",NO_ARG, &get_pinning, 1 },
+ { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 },
+ { "get-all", NO_ARG, &get_all, 1 },
+ { "run", NO_ARG, &run, 1 },
+ { "create", NO_ARG, &create, 1 },
+ { "destroy", NO_ARG, &destroy, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ vcpu = 0;
+ progname = basename(argv[0]);
+
+ while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
+ switch (ch) {
+ case 0:
+ break;
+ case VMNAME:
+ vmname = optarg;
+ break;
+ case VCPU:
+ vcpu = atoi(optarg);
+ break;
+ case SET_LOWMEM:
+ lowmem = atoi(optarg) * MB;
+ lowmem = roundup(lowmem, 2 * MB);
+ break;
+ case SET_HIGHMEM:
+ highmem = atoi(optarg) * MB;
+ highmem = roundup(highmem, 2 * MB);
+ break;
+ case SET_EFER:
+ efer = strtoul(optarg, NULL, 0);
+ set_efer = 1;
+ break;
+ case SET_CR0:
+ cr0 = strtoul(optarg, NULL, 0);
+ set_cr0 = 1;
+ break;
+ case SET_CR3:
+ cr3 = strtoul(optarg, NULL, 0);
+ set_cr3 = 1;
+ break;
+ case SET_CR4:
+ cr4 = strtoul(optarg, NULL, 0);
+ set_cr4 = 1;
+ break;
+ case SET_DR7:
+ dr7 = strtoul(optarg, NULL, 0);
+ set_dr7 = 1;
+ break;
+ case SET_RSP:
+ rsp = strtoul(optarg, NULL, 0);
+ set_rsp = 1;
+ break;
+ case SET_RIP:
+ rip = strtoul(optarg, NULL, 0);
+ set_rip = 1;
+ break;
+ case SET_RAX:
+ rax = strtoul(optarg, NULL, 0);
+ set_rax = 1;
+ break;
+ case SET_RFLAGS:
+ rflags = strtoul(optarg, NULL, 0);
+ set_rflags = 1;
+ break;
+ case DESC_BASE:
+ desc_base = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_LIMIT:
+ desc_limit = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_ACCESS:
+ desc_access = strtoul(optarg, NULL, 0);
+ break;
+ case SET_CS:
+ cs = strtoul(optarg, NULL, 0);
+ set_cs = 1;
+ break;
+ case SET_DS:
+ ds = strtoul(optarg, NULL, 0);
+ set_ds = 1;
+ break;
+ case SET_ES:
+ es = strtoul(optarg, NULL, 0);
+ set_es = 1;
+ break;
+ case SET_FS:
+ fs = strtoul(optarg, NULL, 0);
+ set_fs = 1;
+ break;
+ case SET_GS:
+ gs = strtoul(optarg, NULL, 0);
+ set_gs = 1;
+ break;
+ case SET_SS:
+ ss = strtoul(optarg, NULL, 0);
+ set_ss = 1;
+ break;
+ case SET_TR:
+ tr = strtoul(optarg, NULL, 0);
+ set_tr = 1;
+ break;
+ case SET_LDTR:
+ ldtr = strtoul(optarg, NULL, 0);
+ set_ldtr = 1;
+ break;
+ case SET_PINNING:
+ pincpu = strtol(optarg, NULL, 0);
+ set_pinning = 1;
+ break;
+ case SET_X2APIC_STATE:
+ x2apic_state = strtol(optarg, NULL, 0);
+ set_x2apic_state = 1;
+ break;
+ case SET_VMCS_EXCEPTION_BITMAP:
+ exception_bitmap = strtoul(optarg, NULL, 0);
+ set_exception_bitmap = 1;
+ break;
+ case SET_VMCS_ENTRY_INTERRUPTION_INFO:
+ vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
+ set_vmcs_entry_interruption_info = 1;
+ break;
+ case SET_CAP:
+ capval = strtoul(optarg, NULL, 0);
+ setcap = 1;
+ break;
+ case CAPNAME:
+ capname = optarg;
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (vmname == NULL)
+ usage();
+
+ error = 0;
+
+ if (!error && create)
+ error = vm_create(vmname);
+
+ if (!error) {
+ ctx = vm_open(vmname);
+ if (ctx == NULL)
+ error = -1;
+ }
+
+ if (!error && lowmem)
+ error = vm_setup_memory(ctx, 0, lowmem, NULL);
+
+ if (!error && highmem)
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+
+ if (!error && set_efer)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
+
+ if (!error && set_cr0)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
+
+ if (!error && set_cr3)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
+
+ if (!error && set_cr4)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
+
+ if (!error && set_dr7)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
+
+ if (!error && set_rsp)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
+
+ if (!error && set_rip)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
+
+ if (!error && set_rax)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
+
+ if (!error && set_rflags) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ rflags);
+ }
+
+ if (!error && set_desc_ds) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_es) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ss) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_cs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_fs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_tr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ldtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gdtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_desc_idtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_cs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
+
+ if (!error && set_ds)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
+
+ if (!error && set_es)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
+
+ if (!error && set_fs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
+
+ if (!error && set_gs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
+
+ if (!error && set_ss)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
+
+ if (!error && set_tr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
+
+ if (!error && set_ldtr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
+
+ if (!error && set_pinning)
+ error = vm_set_pinning(ctx, vcpu, pincpu);
+
+ if (!error && set_x2apic_state)
+ error = vm_set_x2apic_state(ctx, vcpu, x2apic_state);
+
+ if (!error && set_exception_bitmap) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ exception_bitmap);
+ }
+
+ if (!error && set_vmcs_entry_interruption_info) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
+ vmcs_entry_interruption_info);
+ }
+
+ if (!error && (get_lowmem || get_all)) {
+ gpa = 0;
+ error = vm_get_memory_seg(ctx, gpa, &len);
+ if (error == 0)
+ printf("lowmem\t\t0x%016lx/%ld\n", gpa, len);
+ }
+
+ if (!error && (get_highmem || get_all)) {
+ gpa = 4 * GB;
+ error = vm_get_memory_seg(ctx, gpa, &len);
+ if (error == 0)
+ printf("highmem\t\t0x%016lx/%ld\n", gpa, len);
+ }
+
+ if (!error && (get_efer || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
+ if (error == 0)
+ printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
+ }
+
+ if (!error && (get_cr0 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
+ if (error == 0)
+ printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_cr3 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
+ if (error == 0)
+ printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_cr4 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
+ if (error == 0)
+ printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_dr7 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
+ if (error == 0)
+ printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
+ }
+
+ if (!error && (get_rsp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
+ if (error == 0)
+ printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_rip || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ if (error == 0)
+ printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_rax || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
+ if (error == 0)
+ printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
+ }
+
+ if (!error && (get_rbx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
+ if (error == 0)
+ printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
+ }
+
+ if (!error && (get_rcx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
+ if (error == 0)
+ printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
+ }
+
+ if (!error && (get_rdx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
+ if (error == 0)
+ printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
+ }
+
+ if (!error && (get_rsi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
+ if (error == 0)
+ printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
+ }
+
+ if (!error && (get_rdi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
+ if (error == 0)
+ printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
+ }
+
+ if (!error && (get_rbp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
+ if (error == 0)
+ printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
+ }
+
+ if (!error && (get_r8 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
+ if (error == 0)
+ printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
+ }
+
+ if (!error && (get_r9 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
+ if (error == 0)
+ printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
+ }
+
+ if (!error && (get_r10 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
+ if (error == 0)
+ printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
+ }
+
+ if (!error && (get_r11 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
+ if (error == 0)
+ printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
+ }
+
+ if (!error && (get_r12 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
+ if (error == 0)
+ printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
+ }
+
+ if (!error && (get_r13 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
+ if (error == 0)
+ printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
+ }
+
+ if (!error && (get_r14 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
+ if (error == 0)
+ printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
+ }
+
+ if (!error && (get_r15 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
+ if (error == 0)
+ printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
+ }
+
+ if (!error && (get_rflags || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error == 0)
+ printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
+ }
+
+ if (!error && (get_stats || get_all)) {
+ int i, num_stats;
+ uint64_t *stats;
+ struct timeval tv;
+ const char *desc;
+
+ stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
+ if (stats != NULL) {
+ printf("vcpu%d\n", vcpu);
+ for (i = 0; i < num_stats; i++) {
+ desc = vm_get_stat_desc(ctx, i);
+ printf("%-32s\t%ld\n", desc, stats[i]);
+ }
+ }
+ }
+
+ if (!error && (get_desc_ds || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_es || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_fs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ss || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_cs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_tr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ldtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gdtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_desc_idtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_cs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
+ if (error == 0)
+ printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
+ }
+
+ if (!error && (get_ds || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
+ if (error == 0)
+ printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
+ }
+
+ if (!error && (get_es || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
+ if (error == 0)
+ printf("es[%d]\t\t0x%04lx\n", vcpu, es);
+ }
+
+ if (!error && (get_fs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
+ if (error == 0)
+ printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
+ }
+
+ if (!error && (get_gs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
+ if (error == 0)
+ printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
+ }
+
+ if (!error && (get_ss || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
+ if (error == 0)
+ printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
+ }
+
+ if (!error && (get_tr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
+ if (error == 0)
+ printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
+ }
+
+ if (!error && (get_ldtr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
+ if (error == 0)
+ printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
+ }
+
+ if (!error && (get_pinning || get_all)) {
+ error = vm_get_pinning(ctx, vcpu, &pincpu);
+ if (error == 0) {
+ if (pincpu < 0)
+ printf("pincpu[%d]\tunpinned\n", vcpu);
+ else
+ printf("pincpu[%d]\t%d\n", vcpu, pincpu);
+ }
+ }
+
+ if (!error && (get_x2apic_state || get_all)) {
+ error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state);
+ if (error == 0)
+ printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state);
+ }
+
+ if (!error && (get_pinbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_PRI_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls2 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_SEC_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_vmcs_gla || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_LINEAR_ADDRESS, &u64);
+ if (error == 0)
+ printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_gpa || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
+ if (error == 0)
+ printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_entry_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
+ if (error == 0) {
+ printf("entry_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_eptp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
+ if (error == 0)
+ printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
+ }
+
+ if (!error && (get_exception_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ &bm);
+ if (error == 0)
+ printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_io_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
+ if (error == 0)
+ printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm);
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+ if (error == 0)
+ printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_tsc_offset || get_all)) {
+ uint64_t tscoff;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+ if (error == 0)
+ printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff);
+ }
+
+ if (!error && (get_cr0_mask || get_all)) {
+ uint64_t cr0mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+ if (error == 0)
+ printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask);
+ }
+
+ if (!error && (get_cr0_shadow || get_all)) {
+ uint64_t cr0shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+ &cr0shadow);
+ if (error == 0)
+ printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow);
+ }
+
+ if (!error && (get_cr4_mask || get_all)) {
+ uint64_t cr4mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+ if (error == 0)
+ printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask);
+ }
+
+ if (!error && (get_cr4_shadow || get_all)) {
+ uint64_t cr4shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+ &cr4shadow);
+ if (error == 0)
+ printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow);
+ }
+
+ if (!error && (get_cr3_targets || get_all)) {
+ uint64_t target_count, target_addr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+ &target_count);
+ if (error == 0) {
+ printf("cr3_target_count[%d]\t0x%08lx\n",
+ vcpu, target_count);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target0[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target1[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target2[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target3[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+ }
+
+ if (!error && (get_apic_access_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+ if (error == 0)
+ printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_virtual_apic_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+ if (error == 0)
+ printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_tpr_threshold || get_all)) {
+ uint64_t threshold;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+ &threshold);
+ if (error == 0)
+ printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold);
+ }
+
+ if (!error && (get_msr_bitmap_address || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_msr_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ error = dump_vmcs_msr_bitmap(vcpu, addr);
+ }
+
+ if (!error && (get_vpid || get_all)) {
+ uint64_t vpid;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+ if (error == 0)
+ printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid);
+ }
+
+ if (!error && (get_ple_window || get_all)) {
+ uint64_t window;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+ if (error == 0)
+ printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window);
+ }
+
+ if (!error && (get_ple_gap || get_all)) {
+ uint64_t gap;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+ if (error == 0)
+ printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap);
+ }
+
+ if (!error && (get_inst_err || get_all)) {
+ uint64_t insterr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+ &insterr);
+ if (error == 0) {
+ printf("instruction_error[%d]\t0x%08lx\n",
+ vcpu, insterr);
+ }
+ }
+
+ if (!error && (get_exit_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+ if (error == 0)
+ printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_entry_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+ if (error == 0)
+ printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_host_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_guest_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_host_cr0 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+ if (error == 0)
+ printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_host_cr3 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+ if (error == 0)
+ printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_host_cr4 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+ if (error == 0)
+ printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_host_rip || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_host_rsp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+ if (error == 0)
+ printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_guest_sysenter || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+ if (error == 0)
+ printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs);
+
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+ if (error == 0)
+ printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+ if (error == 0)
+ printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_vmcs_link || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+ if (error == 0)
+ printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_vmcs_exit_reason || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+ if (error == 0)
+ printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_qualification || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+ &u64);
+ if (error == 0)
+ printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+ vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_INFO, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_exit_interruption_error || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_EXIT_INTERRUPTION_ERROR, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_interruptibility || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_INTERRUPTIBILITY, &u64);
+ if (error == 0) {
+ printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && setcap) {
+ int captype;
+ captype = vm_capability_name2type(capname);
+ error = vm_set_capability(ctx, vcpu, captype, capval);
+ if (error != 0 && errno == ENOENT)
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+
+ if (!error && (getcap || get_all)) {
+ int captype, val, getcaptype;
+
+ if (getcap && capname)
+ getcaptype = vm_capability_name2type(capname);
+ else
+ getcaptype = -1;
+
+ for (captype = 0; captype < VM_CAP_MAX; captype++) {
+ if (getcaptype >= 0 && captype != getcaptype)
+ continue;
+ error = vm_get_capability(ctx, vcpu, captype, &val);
+ if (error == 0) {
+ printf("Capability \"%s\" is %s on vcpu %d\n",
+ vm_capability_type2name(captype),
+ val ? "set" : "not set", vcpu);
+ } else if (errno == ENOENT) {
+ printf("Capability \"%s\" is not available\n",
+ vm_capability_type2name(captype));
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!error && run) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ error = vm_run(ctx, vcpu, rip, &vmexit);
+ if (error == 0)
+ dump_vm_run_exitcode(&vmexit, vcpu);
+ else
+ printf("vm_run error %d\n", error);
+ }
+
+ if (error)
+ printf("errno = %d\n", errno);
+
+ if (!error && destroy)
+ vm_destroy(ctx);
+
+ exit(error);
+}
diff --git a/usr.sbin/bhyveload/Makefile b/usr.sbin/bhyveload/Makefile
new file mode 100644
index 0000000..7b00818
--- /dev/null
+++ b/usr.sbin/bhyveload/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+PROG= bhyveload
+SRCS= bhyveload.c
+MAN= bhyveload.8
+
+DPADD+= ${LIBVMMAPI}
+LDADD+= -lvmmapi
+
+WARNS?= 3
+
+CFLAGS+=-I${.CURDIR}/../../sys/boot/userboot
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/bhyveload/bhyveload.8 b/usr.sbin/bhyveload/bhyveload.8
new file mode 100644
index 0000000..2918c4c
--- /dev/null
+++ b/usr.sbin/bhyveload/bhyveload.8
@@ -0,0 +1,130 @@
+.\"
+.\" Copyright (c) 2012 NetApp Inc
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd January 7, 2012
+.Dt BHYVELOAD 8
+.Os
+.Sh NAME
+.Nm bhyveload
+.Nd load a
+.Fx
+guest inside a bhyve virtual machine
+.Sh SYNOPSIS
+.Nm
+.Op Fl m Ar lowmem
+.Op Fl M Ar highmem
+.Op Fl d Ar disk-path
+.Op Fl h Ar host-path
+.Ar vmname
+.Sh DESCRIPTION
+.Nm
+is used to load a
+.Fx
+guest inside a
+.Xr bhyve 4
+virtual machine.
+.Pp
+.Nm
+is based on
+.Xr loader 8
+and will present an interface identical to that of the
+.Fx
+loader on the user's terminal.
+.Pp
+The virtual machine is identified as
+.Ar vmname
+and will be created if it does not already exist.
+.Sh OPTIONS
+The following options are available:
+.Bl -tag -width indent
+.It Fl m Ar lowmem
+.Ar lowmem
+is the amount of memory allocated below 4GB in the guest's physical address
+space.
+.Pp
+The default value of
+.Ar lowmem
+is 128MB.
+.It Fl M Ar highmem
+.Ar highmem
+is the amount of memory allocated above 4GB in the guest's physical address
+space.
+.Pp
+The default value of
+.Ar highmem
+is 0MB.
+.It Fl d Ar disk-path
+The
+.Ar disk-path
+is the pathname of the guest's boot disk image.
+.It Fl h Ar host-path
+The
+.Ar host-path
+is the directory at the top of the guest's boot filesystem.
+.El
+.Sh EXAMPLES
+To create a virtual machine named
+.Ar freebsd-vm
+that boots off the ISO image
+.Pa /freebsd/release.iso
+and has 1GB of memory allocated to it:
+.Pp
+.Dl "bhyveload -m 256 -M 768 -d /freebsd/release.iso freebsd-vm"
+.Pp
+In the example above the 1GB allocation is split into two segments:
+.Bl -dash -compact
+.It
+256MB below the 4GB boundary (0MB - 256MB)
+.It
+768MB above the 4GB boundary (4096MB - 4864MB)
+.El
+.Sh SEE ALSO
+.Xr bhyve 4 ,
+.Xr bhyve 8 ,
+.Xr loader 8 ,
+.Xr vmm 4
+.Sh HISTORY
+.Nm
+first appeared in
+.Fx 10.0 ,
+and was developed at NetApp Inc.
+.Sh AUTHORS
+.Nm
+was developed by
+.An -nosplit
+.An "Neel Natu" Aq neel@FreeBSD.org
+at NetApp Inc with a lot of help from
+.An Doug Rabson Aq dfr@FreeBSD.org
+.Sh BUGS
+.Nm
+can load only
+.Fx
+as a guest.
diff --git a/usr.sbin/bhyveload/bhyveload.c b/usr.sbin/bhyveload/bhyveload.c
new file mode 100644
index 0000000..ef12d9f
--- /dev/null
+++ b/usr.sbin/bhyveload/bhyveload.c
@@ -0,0 +1,652 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * Copyright (c) 2011 Google, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include <vmmapi.h>
+
+#include "userboot.h"
+
+#define MB (1024 * 1024UL)
+#define GB (1024 * 1024 * 1024UL)
+#define BSP 0
+
+static char *host_base = "/";
+static struct termios term, oldterm;
+static int disk_fd = -1;
+
+static char *vmname, *progname, *membase;
+static uint64_t lowmem, highmem;
+static struct vmctx *ctx;
+
+static uint64_t gdtbase, cr3, rsp;
+
+static void cb_exit(void *arg, int v);
+
+/*
+ * Console i/o callbacks
+ */
+
+static void
+cb_putc(void *arg, int ch)
+{
+ char c = ch;
+
+ write(1, &c, 1);
+}
+
+static int
+cb_getc(void *arg)
+{
+ char c;
+
+ if (read(0, &c, 1) == 1)
+ return (c);
+ return (-1);
+}
+
+static int
+cb_poll(void *arg)
+{
+ int n;
+
+ if (ioctl(0, FIONREAD, &n) >= 0)
+ return (n > 0);
+ return (0);
+}
+
+/*
+ * Host filesystem i/o callbacks
+ */
+
+struct cb_file {
+ int cf_isdir;
+ size_t cf_size;
+ struct stat cf_stat;
+ union {
+ int fd;
+ DIR *dir;
+ } cf_u;
+};
+
+static int
+cb_open(void *arg, const char *filename, void **hp)
+{
+ struct cb_file *cf;
+ char path[PATH_MAX];
+
+ if (!host_base)
+ return (ENOENT);
+
+ strlcpy(path, host_base, PATH_MAX);
+ if (path[strlen(path) - 1] == '/')
+ path[strlen(path) - 1] = 0;
+ strlcat(path, filename, PATH_MAX);
+ cf = malloc(sizeof(struct cb_file));
+ if (stat(path, &cf->cf_stat) < 0) {
+ free(cf);
+ return (errno);
+ }
+
+	cf->cf_size = cf->cf_stat.st_size;
+ if (S_ISDIR(cf->cf_stat.st_mode)) {
+ cf->cf_isdir = 1;
+ cf->cf_u.dir = opendir(path);
+ if (!cf->cf_u.dir)
+ goto out;
+ *hp = cf;
+ return (0);
+ }
+ if (S_ISREG(cf->cf_stat.st_mode)) {
+ cf->cf_isdir = 0;
+ cf->cf_u.fd = open(path, O_RDONLY);
+ if (cf->cf_u.fd < 0)
+ goto out;
+ *hp = cf;
+ return (0);
+ }
+
+out:
+ free(cf);
+ return (EINVAL);
+}
+
+static int
+cb_close(void *arg, void *h)
+{
+ struct cb_file *cf = h;
+
+ if (cf->cf_isdir)
+ closedir(cf->cf_u.dir);
+ else
+ close(cf->cf_u.fd);
+ free(cf);
+
+ return (0);
+}
+
+static int
+cb_isdir(void *arg, void *h)
+{
+ struct cb_file *cf = h;
+
+ return (cf->cf_isdir);
+}
+
+static int
+cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid)
+{
+ struct cb_file *cf = h;
+ ssize_t sz;
+
+ if (cf->cf_isdir)
+ return (EINVAL);
+ sz = read(cf->cf_u.fd, buf, size);
+ if (sz < 0)
+ return (EINVAL);
+ *resid = size - sz;
+ return (0);
+}
+
+static int
+cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
+ size_t *namelen_return, char *name)
+{
+ struct cb_file *cf = h;
+ struct dirent *dp;
+
+ if (!cf->cf_isdir)
+ return (EINVAL);
+
+ dp = readdir(cf->cf_u.dir);
+ if (!dp)
+ return (ENOENT);
+
+ /*
+ * Note: d_namlen is in the range 0..255 and therefore less
+ * than PATH_MAX so we don't need to test before copying.
+ */
+ *fileno_return = dp->d_fileno;
+ *type_return = dp->d_type;
+ *namelen_return = dp->d_namlen;
+ memcpy(name, dp->d_name, dp->d_namlen);
+ name[dp->d_namlen] = 0;
+
+ return (0);
+}
+
+static int
+cb_seek(void *arg, void *h, uint64_t offset, int whence)
+{
+ struct cb_file *cf = h;
+
+ if (cf->cf_isdir)
+ return (EINVAL);
+ if (lseek(cf->cf_u.fd, offset, whence) < 0)
+ return (errno);
+ return (0);
+}
+
+static int
+cb_stat(void *arg, void *h, int *mode, int *uid, int *gid, uint64_t *size)
+{
+ struct cb_file *cf = h;
+
+ *mode = cf->cf_stat.st_mode;
+ *uid = cf->cf_stat.st_uid;
+ *gid = cf->cf_stat.st_gid;
+ *size = cf->cf_stat.st_size;
+ return (0);
+}
+
+/*
+ * Disk image i/o callbacks
+ */
+
+static int
+cb_diskread(void *arg, int unit, uint64_t from, void *to, size_t size,
+ size_t *resid)
+{
+ ssize_t n;
+
+ if (unit != 0 || disk_fd == -1)
+ return (EIO);
+ n = pread(disk_fd, to, size, from);
+ if (n < 0)
+ return (errno);
+ *resid = size - n;
+ return (0);
+}
+
+static int
+cb_diskioctl(void *arg, int unit, u_long cmd, void *data)
+{
+ struct stat sb;
+
+ if (unit != 0 || disk_fd == -1)
+ return (EBADF);
+
+ switch (cmd) {
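+	/* The backing image is exposed as a 512-byte-sector disk. */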
+ case DIOCGSECTORSIZE:
+ *(u_int *)data = 512;
+ break;
+ case DIOCGMEDIASIZE:
+ if (fstat(disk_fd, &sb) == 0)
+ *(off_t *)data = sb.st_size;
+ else
+ return (ENOTTY);
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+/*
+ * Guest virtual machine i/o callbacks
+ */
+static int
+cb_copyin(void *arg, const void *from, uint64_t to, size_t size)
+{
+
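+	/*
+	 * The loader hands us kernel virtual addresses; masking off the
+	 * upper bits turns them into offsets into guest physical memory
+	 * (this assumes the standard amd64 kernel virtual address map).
+	 */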
+ to &= 0x7fffffff;
+ if (to > lowmem)
+ return (EFAULT);
+ if (to + size > lowmem)
+ size = lowmem - to;
+
+ memcpy(&membase[to], from, size);
+
+ return (0);
+}
+
+static int
+cb_copyout(void *arg, uint64_t from, void *to, size_t size)
+{
+
+ from &= 0x7fffffff;
+ if (from > lowmem)
+ return (EFAULT);
+ if (from + size > lowmem)
+ size = lowmem - from;
+
+ memcpy(to, &membase[from], size);
+
+ return (0);
+}
+
+static void
+cb_setreg(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
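+	/* 4 is the amd64 register-file encoding of %rsp. */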
+ case 4:
+ vmreg = VM_REG_GUEST_RSP;
+ rsp = v;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+		printf("cb_setreg(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_register");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setmsr(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case MSR_EFER:
+ vmreg = VM_REG_GUEST_EFER;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+		printf("cb_setmsr(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_msr");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setcr(void *arg, int r, uint64_t v)
+{
+ int error;
+ enum vm_reg_name vmreg;
+
+ vmreg = VM_REG_LAST;
+
+ switch (r) {
+ case 0:
+ vmreg = VM_REG_GUEST_CR0;
+ break;
+ case 3:
+ vmreg = VM_REG_GUEST_CR3;
+ cr3 = v;
+ break;
+ case 4:
+ vmreg = VM_REG_GUEST_CR4;
+ break;
+ default:
+ break;
+ }
+
+ if (vmreg == VM_REG_LAST) {
+		printf("cb_setcr(%d): not implemented\n", r);
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ error = vm_set_register(ctx, BSP, vmreg, v);
+ if (error) {
+ perror("vm_set_cr");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+}
+
+static void
+cb_setgdt(void *arg, uint64_t base, size_t size)
+{
+ int error;
+
+ error = vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0);
+ if (error != 0) {
+ perror("vm_set_desc(gdt)");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ gdtbase = base;
+}
+
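+/*
+ * The final callback: userboot passes the kernel entry point in 'rip'.
+ * The cr3, gdtbase and rsp values captured by the callbacks above are
+ * combined into the register state a 64-bit FreeBSD kernel expects,
+ * after which the loader exits and the virtual machine can be run.
+ */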
+static void
+cb_exec(void *arg, uint64_t rip)
+{
+ int error;
+
+ error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, rsp);
+ if (error) {
+ perror("vm_setup_freebsd_registers");
+ cb_exit(NULL, USERBOOT_EXIT_QUIT);
+ }
+
+ cb_exit(NULL, 0);
+}
+
+/*
+ * Misc
+ */
+
+static void
+cb_delay(void *arg, int usec)
+{
+
+ usleep(usec);
+}
+
+static void
+cb_exit(void *arg, int v)
+{
+
+ tcsetattr(0, TCSAFLUSH, &oldterm);
+ exit(v);
+}
+
+static void
+cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
+{
+
+ *ret_lowmem = lowmem;
+ *ret_highmem = highmem;
+}
+
+static const char *
+cb_getenv(void *arg, int num)
+{
+ int max;
+
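+	/*
+	 * Canned loader environment: identify the BIOS vendor as BHYVE
+	 * and request a serial console (boot_serial=1).
+	 */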
+ static const char * var[] = {
+ "smbios.bios.vendor=BHYVE",
+ "boot_serial=1",
+ NULL
+ };
+
+ max = sizeof(var) / sizeof(var[0]);
+
+ if (num < max)
+ return (var[num]);
+ else
+ return (NULL);
+}
+
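+/*
+ * Callback table handed to the userboot loader; loader_main() (resolved
+ * via dlsym() in main() below) drives the boot through these entry
+ * points.
+ */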
+static struct loader_callbacks cb = {
+ .getc = cb_getc,
+ .putc = cb_putc,
+ .poll = cb_poll,
+
+ .open = cb_open,
+ .close = cb_close,
+ .isdir = cb_isdir,
+ .read = cb_read,
+ .readdir = cb_readdir,
+ .seek = cb_seek,
+ .stat = cb_stat,
+
+ .diskread = cb_diskread,
+ .diskioctl = cb_diskioctl,
+
+ .copyin = cb_copyin,
+ .copyout = cb_copyout,
+ .setreg = cb_setreg,
+ .setmsr = cb_setmsr,
+ .setcr = cb_setcr,
+ .setgdt = cb_setgdt,
+ .exec = cb_exec,
+
+ .delay = cb_delay,
+ .exit = cb_exit,
+ .getmem = cb_getmem,
+
+ .getenv = cb_getenv,
+};
+
+static void
+usage(void)
+{
+
+ printf("usage: %s [-d <disk image path>] [-h <host filesystem path>] "
+	    "[-m <lowmem>] [-M <highmem>] "
+ "<vmname>\n", progname);
+ exit(1);
+}
+
+int
+main(int argc, char** argv)
+{
+ void *h;
+ void (*func)(struct loader_callbacks *, void *, int, int);
+ int opt, error;
+ char *disk_image;
+
+ progname = argv[0];
+
+ lowmem = 128 * MB;
+ highmem = 0;
+ disk_image = NULL;
+
+ while ((opt = getopt(argc, argv, "d:h:m:M:")) != -1) {
+ switch (opt) {
+ case 'd':
+ disk_image = optarg;
+ break;
+
+ case 'h':
+ host_base = optarg;
+ break;
+
+ case 'm':
+ lowmem = strtoul(optarg, NULL, 0) * MB;
+ break;
+
+ case 'M':
+ highmem = strtoul(optarg, NULL, 0) * MB;
+ break;
+
+ case '?':
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage();
+
+ vmname = argv[0];
+
+ error = vm_create(vmname);
+ if (error != 0 && errno != EEXIST) {
+ perror("vm_create");
+ exit(1);
+ }
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ error = vm_setup_memory(ctx, 0, lowmem, &membase);
+ if (error) {
+ perror("vm_setup_memory(lowmem)");
+ exit(1);
+ }
+
+ if (highmem != 0) {
+ error = vm_setup_memory(ctx, 4 * GB, highmem, NULL);
+ if (error) {
+ perror("vm_setup_memory(highmem)");
+ exit(1);
+ }
+ }
+
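+	/*
+	 * Put the terminal into raw-ish mode for the loader's interactive
+	 * console; cb_exit() restores the saved settings on the way out.
+	 */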
+ tcgetattr(0, &term);
+ oldterm = term;
+ term.c_lflag &= ~(ICANON|ECHO);
+ term.c_iflag &= ~ICRNL;
+ tcsetattr(0, TCSAFLUSH, &term);
+ h = dlopen("/boot/userboot.so", RTLD_LOCAL);
+ if (!h) {
+ printf("%s\n", dlerror());
+ return (1);
+ }
+ func = dlsym(h, "loader_main");
+ if (!func) {
+ printf("%s\n", dlerror());
+ return (1);
+ }
+
+ if (disk_image) {
+ disk_fd = open(disk_image, O_RDONLY);
+ }
+ func(&cb, NULL, USERBOOT_VERSION_3, disk_fd >= 0);
+}