Diffstat (limited to 'sys/amd64')
43 files changed, 10025 insertions, 0 deletions
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 895619c..c95fee0 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -297,6 +297,7 @@ */ #define APICBASE_RESERVED 0x000006ff #define APICBASE_BSP 0x00000100 +#define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h new file mode 100644 index 0000000..0f4c356 --- /dev/null +++ b/sys/amd64/include/vmm.h @@ -0,0 +1,268 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vlapic; + +typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_cleanup_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct vm_exit *vmexit); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_mmap_func_t)(void *vmi, vm_paddr_t gpa, vm_paddr_t hpa, + size_t length, vm_memattr_t attr, + int prot, boolean_t superpages_ok); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_inject_event_t)(void *vmi, int vcpu, + int type, int vector, + uint32_t code, int code_valid); +typedef int (*vmi_inject_nmi_t)(void *vmi, int vcpu); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_mmap_func_t vmmmap; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_inject_event_t vminject; + vmi_inject_nmi_t vmnmi; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +struct vm *vm_create(const char *name); +void vm_destroy(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid); +int vm_set_pinning(struct vm *vm, int vcpu, int cpuid); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_inject_event(struct vm *vm, int vcpu, int type, + int vector, uint32_t error_code, int error_code_valid); +int vm_inject_nmi(struct vm *vm, int vcpu); +uint64_t *vm_guest_msrs(struct vm *vm, int cpu); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +void vm_activate_cpu(struct vm *vm, int vcpu); +cpumask_t vm_active_cpus(struct vm *vm); + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. 
+ */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +#define VCPU_STOPPED 0 +#define VCPU_RUNNING 1 +void vm_set_run_state(struct vm *vm, int vcpu, int running); +int vm_get_run_state(struct vm *vm, int vcpu, int *hostcpu); + +void *vcpu_stats(struct vm *vm, int vcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vm_get_run_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +static cpumask_t __inline +vcpu_mask(int vcpuid) +{ + return ((cpumask_t)1 << vcpuid); +} + +#endif /* KERNEL */ + +#define VM_MAXCPU 8 /* maximum virtual cpus */ + +/* + * Identifiers for events that can be injected into the VM + */ +enum vm_event_type { + VM_EVENT_NONE, + VM_HW_INTR, + VM_NMI, + VM_HW_EXCEPTION, + VM_SW_INTR, + VM_PRIV_SW_EXCEPTION, + VM_SW_EXCEPTION, + VM_EVENT_MAX +}; + +/* + * Identifiers for architecturally defined registers. + */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_LAST +}; + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_MAX +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. + */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_MAX, +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; /* out is 0, in is 1 */ + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ + } inout; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int error; /* vmx inst error */ + uint32_t exit_reason; + uint64_t exit_qualification; + } vmx; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + } u; +}; + +#endif /* _VMM_H_ */ diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h new file mode 100644 index 0000000..1b143b5 --- /dev/null +++ b/sys/amd64/include/vmm_dev.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $ + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +void vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t hpa; /* out */ + vm_paddr_t gpa; /* in */ + size_t len; /* in */ +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_pin { + int vm_cpuid; + int host_cpuid; /* -1 to unpin */ +}; + +struct vm_run { + int cpuid; + uint64_t rip; /* start running here */ + struct vm_exit vm_exit; +}; + +struct vm_event { + int cpuid; + enum vm_event_type type; + int vector; + uint32_t error_code; + int error_code_valid; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + int vector; + int destcpu; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +enum { + IOCNUM_RUN, + IOCNUM_SET_PINNING, + IOCNUM_GET_PINNING, + IOCNUM_MAP_MEMORY, + IOCNUM_GET_MEMORY_SEG, + IOCNUM_SET_REGISTER, + IOCNUM_GET_REGISTER, + IOCNUM_SET_SEGMENT_DESCRIPTOR, + IOCNUM_GET_SEGMENT_DESCRIPTOR, + IOCNUM_INJECT_EVENT, + IOCNUM_LAPIC_IRQ, + IOCNUM_SET_CAPABILITY, + IOCNUM_GET_CAPABILITY, + IOCNUM_BIND_PPTDEV, + IOCNUM_UNBIND_PPTDEV, + IOCNUM_MAP_PPTDEV_MMIO, + IOCNUM_PPTDEV_MSI, + IOCNUM_INJECT_NMI, + IOCNUM_VM_STATS, + IOCNUM_VM_STAT_DESC, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SET_PINNING \ + _IOW('v', IOCNUM_SET_PINNING, struct vm_pin) +#define VM_GET_PINNING \ + _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', 
IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EVENT \ + _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#endif diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c new file mode 100644 index 0000000..41e937a --- /dev/null +++ b/sys/amd64/vmm/amd/amdv.c @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> + +#include <machine/vmm.h> +#include "io/iommu.h" + +static int +amdv_init(void) +{ + + printf("amdv_init: not implemented\n"); + return (ENXIO); +} + +static int +amdv_cleanup(void) +{ + + printf("amdv_cleanup: not implemented\n"); + return (ENXIO); +} + +static void * +amdv_vminit(struct vm *vm) +{ + + printf("amdv_vminit: not implemented\n"); + return (NULL); +} + +static int +amdv_vmrun(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + + printf("amdv_vmrun: not implemented\n"); + return (ENXIO); +} + +static void +amdv_vmcleanup(void *arg) +{ + + printf("amdv_vmcleanup: not implemented\n"); + return; +} + +static int +amdv_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t spok) +{ + + printf("amdv_vmmmap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) +{ + + printf("amdv_getreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) +{ + + printf("amdv_setreg: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) +{ + + printf("amdv_get_desc: not implemented\n"); + return (EINVAL); +} + +static int +amdv_inject_event(void *vmi, int vcpu, int type, int vector, + uint32_t error_code, int error_code_valid) +{ + + printf("amdv_inject_event: not implemented\n"); + return (EINVAL); +} + +static int +amdv_nmi(void *arg, int vcpu) +{ + + printf("amdv_nmi: not implemented\n"); + return (EINVAL); +} + +static int +amdv_getcap(void *arg, int vcpu, int type, int *retval) +{ + + printf("amdv_getcap: not implemented\n"); + return (EINVAL); +} + +static int +amdv_setcap(void *arg, int vcpu, int type, int val) +{ + + printf("amdv_setcap: not implemented\n"); + return (EINVAL); +} + +struct vmm_ops vmm_ops_amd = { + amdv_init, + amdv_cleanup, + amdv_vminit, + amdv_vmrun, + amdv_vmcleanup, + amdv_vmmmap, + amdv_getreg, + amdv_setreg, + amdv_getdesc, + amdv_setdesc, + amdv_inject_event, + amdv_nmi, + amdv_getcap, + amdv_setcap +}; + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, int bus, int slot, int func) +{ + + printf("amd_iommu_remove_device: not 
implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, +}; diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c new file mode 100644 index 0000000..c9fca9d --- /dev/null +++ b/sys/amd64/vmm/intel/ept.c @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/smp.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/param.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmx_cpufunc.h" +#include "vmx_msr.h" +#include "vmx.h" +#include "ept.h" + +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PG_RD (1 << 0) +#define EPT_PG_WR (1 << 1) +#define EPT_PG_EX (1 << 2) +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) +#define EPT_PG_IGNORE_PAT (1 << 6) +#define EPT_PG_SUPERPAGE (1 << 7) + +#define EPT_ADDR_MASK ((uint64_t)-1 << 12) + +MALLOC_DECLARE(M_VMX); + +static uint64_t page_sizes_mask; + +int +ept_init(void) +{ + int page_shift; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + /* Set bits in 'page_sizes_mask' for each valid page size */ + page_shift = PAGE_SHIFT; + page_sizes_mask = 1UL << page_shift; /* 4KB page */ + + page_shift += 9; + if (EPT_PDE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + + page_shift += 9; + if (EPT_PDPTE_SUPERPAGE(cap)) + page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + + return (0); +} + +static size_t +ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, vm_prot_t prot, boolean_t spok) +{ + int spshift, ptpshift, ptpindex, nlevels; + + /* + * Compute the size of the mapping that we can accomodate. 
+ * + * This is based on three factors: + * - super page sizes supported by the processor + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = PAGE_SHIFT; + if (spok) + spshift += (EPT_PWLEVELS - 1) * 9; + while (spshift >= PAGE_SHIFT) { + uint64_t spsize = 1UL << spshift; + if ((page_sizes_mask & spsize) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + length >= spsize) { + break; + } + spshift -= 9; + } + + if (spshift < PAGE_SHIFT) { + panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " + "length 0x%016lx, page_sizes_mask 0x%016lx", + gpa, hpa, length, page_sizes_mask); + } + + nlevels = EPT_PWLEVELS; + while (--nlevels >= 0) { + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) + break; + + /* + * We are working on a non-leaf page table page. + * + * Create the next level page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp); + ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; + } + + /* Work our way down to the next level page table page */ + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) { + panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " + "mismatch\n", gpa, ptpshift); + } + + /* Do the mapping */ + ptp[ptpindex] = hpa; + + /* Apply the access controls */ + if (prot & VM_PROT_READ) + ptp[ptpindex] |= EPT_PG_RD; + if (prot & VM_PROT_WRITE) + ptp[ptpindex] |= EPT_PG_WR; + if (prot & VM_PROT_EXECUTE) + ptp[ptpindex] |= EPT_PG_EX; + + /* + * XXX should we enforce this memory type by setting the ignore PAT + * bit to 1. 
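+ * Setting EPT_PG_IGNORE_PAT (bit 6) in the leaf entry would make the EPT
+ * memory type in bits 5:3 authoritative and override the guest PAT.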
+ */ + ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); + + if (nlevels > 0) + ptp[ptpindex] |= EPT_PG_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void +ept_free_pt_entry(pt_entry_t pte) +{ + if (pte == 0) + return; + + /* sanity check */ + if ((pte & EPT_PG_SUPERPAGE) != 0) + panic("ept_free_pt_entry: pte cannot have superpage bit"); + + return; +} + +static void +ept_free_pd_entry(pd_entry_t pde) +{ + pt_entry_t *pt; + int i; + + if (pde == 0) + return; + + if ((pde & EPT_PG_SUPERPAGE) == 0) { + pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); + for (i = 0; i < NPTEPG; i++) + ept_free_pt_entry(pt[i]); + free(pt, M_VMX); /* free the page table page */ + } +} + +static void +ept_free_pdp_entry(pdp_entry_t pdpe) +{ + pd_entry_t *pd; + int i; + + if (pdpe == 0) + return; + + if ((pdpe & EPT_PG_SUPERPAGE) == 0) { + pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); + for (i = 0; i < NPDEPG; i++) + ept_free_pd_entry(pd[i]); + free(pd, M_VMX); /* free the page directory page */ + } +} + +static void +ept_free_pml4_entry(pml4_entry_t pml4e) +{ + pdp_entry_t *pdp; + int i; + + if (pml4e == 0) + return; + + if ((pml4e & EPT_PG_SUPERPAGE) == 0) { + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); + for (i = 0; i < NPDPEPG; i++) + ept_free_pdp_entry(pdp[i]); + free(pdp, M_VMX); /* free the page directory ptr page */ + } +} + +void +ept_vmcleanup(struct vmx *vmx) +{ + int i; + + for (i = 0; i < NPML4EPG; i++) + ept_free_pml4_entry(vmx->pml4ept[i]); +} + +int +ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, + vm_memattr_t attr, int prot, boolean_t spok) +{ + size_t n; + struct vmx *vmx = arg; + + while (len > 0) { + n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, + prot, spok); + len -= n; + gpa += n; + hpa += n; + } + + return (0); +} + +static void +invept_single_context(void *arg) +{ + struct invept_desc desc = *(struct invept_desc *)arg; + + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); +} + +void +ept_invalidate_mappings(u_long pml4ept) +{ + struct invept_desc invept_desc = { 0 }; + + invept_desc.eptp = EPTP(pml4ept); + + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); +} diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h new file mode 100644 index 0000000..013c330 --- /dev/null +++ b/sys/amd64/vmm/intel/ept.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) + +int ept_init(void); +int ept_vmmmap(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, + vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); +void ept_invalidate_mappings(u_long ept_pml4); +void ept_vmcleanup(struct vmx *vmx); +#endif diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c new file mode 100644 index 0000000..80d45cc --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.c @@ -0,0 +1,451 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/pcpu.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/segments.h> +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "vmcs.h" +#include "vmx_cpufunc.h" +#include "ept.h" +#include "vmx.h" + +static uint64_t +vmcs_fix_regval(uint32_t encoding, uint64_t val) +{ + + switch (encoding) { + case VMCS_GUEST_CR0: + val = vmx_fix_cr0(val); + break; + case VMCS_GUEST_CR4: + val = vmx_fix_cr4(val); + break; + default: + break; + } + return (val); +} + +static uint32_t +vmcs_field_encoding(int ident) +{ + switch (ident) { + case VM_REG_GUEST_CR0: + return (VMCS_GUEST_CR0); + case VM_REG_GUEST_CR3: + return (VMCS_GUEST_CR3); + case VM_REG_GUEST_CR4: + return (VMCS_GUEST_CR4); + case VM_REG_GUEST_DR7: + return (VMCS_GUEST_DR7); + case VM_REG_GUEST_RSP: + return (VMCS_GUEST_RSP); + case VM_REG_GUEST_RIP: + return (VMCS_GUEST_RIP); + case VM_REG_GUEST_RFLAGS: + return (VMCS_GUEST_RFLAGS); + case VM_REG_GUEST_ES: + return (VMCS_GUEST_ES_SELECTOR); + case VM_REG_GUEST_CS: + return (VMCS_GUEST_CS_SELECTOR); + case VM_REG_GUEST_SS: + return (VMCS_GUEST_SS_SELECTOR); + case VM_REG_GUEST_DS: + return (VMCS_GUEST_DS_SELECTOR); + case VM_REG_GUEST_FS: + return (VMCS_GUEST_FS_SELECTOR); + case VM_REG_GUEST_GS: + return (VMCS_GUEST_GS_SELECTOR); + case VM_REG_GUEST_TR: + return (VMCS_GUEST_TR_SELECTOR); + case VM_REG_GUEST_LDTR: + return (VMCS_GUEST_LDTR_SELECTOR); + case VM_REG_GUEST_EFER: + return (VMCS_GUEST_IA32_EFER); + default: + return (-1); + } + +} + +static int +vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc) +{ + + switch (seg) { + case VM_REG_GUEST_ES: + *base = VMCS_GUEST_ES_BASE; + *lim = VMCS_GUEST_ES_LIMIT; + *acc = VMCS_GUEST_ES_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_CS: + *base = VMCS_GUEST_CS_BASE; + *lim = VMCS_GUEST_CS_LIMIT; + *acc = VMCS_GUEST_CS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_SS: + *base = VMCS_GUEST_SS_BASE; + *lim = VMCS_GUEST_SS_LIMIT; + *acc = VMCS_GUEST_SS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_DS: + *base = VMCS_GUEST_DS_BASE; + *lim = VMCS_GUEST_DS_LIMIT; + *acc = VMCS_GUEST_DS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_FS: + *base = VMCS_GUEST_FS_BASE; + *lim = VMCS_GUEST_FS_LIMIT; + *acc = VMCS_GUEST_FS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_GS: + *base = VMCS_GUEST_GS_BASE; + *lim = VMCS_GUEST_GS_LIMIT; + *acc = VMCS_GUEST_GS_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_TR: + *base = VMCS_GUEST_TR_BASE; + *lim = VMCS_GUEST_TR_LIMIT; + *acc = VMCS_GUEST_TR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_LDTR: + *base = VMCS_GUEST_LDTR_BASE; + *lim = VMCS_GUEST_LDTR_LIMIT; + *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS; + break; + case VM_REG_GUEST_IDTR: + *base = VMCS_GUEST_IDTR_BASE; + *lim = VMCS_GUEST_IDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + case VM_REG_GUEST_GDTR: + *base = VMCS_GUEST_GDTR_BASE; + *lim = VMCS_GUEST_GDTR_LIMIT; + *acc = VMCS_INVALID_ENCODING; + break; + default: + return (EINVAL); + } + + return (0); +} + +int +vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval) +{ + int error; + uint32_t encoding; + + /* + * If we need to get at vmx-specific state in the VMCS we can bypass + * the translation of 'ident' to 'encoding' by simply setting the + * sign bit. As it so happens the upper 16 bits are reserved (i.e + * set to 0) in the encodings for the VMCS so we are free to use the + * sign bit. 
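+ *
+ * For example, passing VMCS_IDENT(VMCS_GUEST_ACTIVITY) as 'ident'
+ * produces a negative value whose lower 31 bits are the raw encoding
+ * 0x4826, bypassing vmcs_field_encoding() below.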
+ */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + VMPTRLD(vmcs); + error = vmread(encoding, retval); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + VMPTRLD(vmcs); + error = vmwrite(encoding, val); + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_set_defaults(struct vmcs *vmcs, + u_long host_rip, u_long host_rsp, u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t eptp, pat; + uint32_t exc_bitmap; + + codesel = GSEL(GCODE_SEL, SEL_KPL); + datasel = GSEL(GDATA_SEL, SEL_KPL); + tsssel = GSEL(GPROC0_SEL, SEL_KPL); + + /* + * Make sure we have a "current" VMCS to work with. 
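+ * VMPTRLD makes it the current VMCS on this cpu so the vmwrites below
+ * are applied to it; VMCLEAR at the end flushes it back to memory and
+ * makes it inactive again.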
+ */ + VMPTRLD(vmcs); + + /* + * Load the VMX controls + */ + if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) + goto done; + + /* Guest state */ + + /* Initialize guest IA32_PAT MSR with the default value */ + pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) + goto done; + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = rdmsr(MSR_PAT); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = rdmsr(MSR_EFER); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + cr0 = rcr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = rcr4(); + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0) + goto done; + + /* + * Load the Base-Address for %fs and idtr. + * + * Note that we exclude %gs, tss and gdtr here because their base + * address is pcpu specific. + */ + if ((error = vmwrite(VMCS_HOST_FS_BASE, 0)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_IDTR_BASE, r_idt.rd_base)) != 0) + goto done; + + /* instruction pointer */ + if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) + goto done; + + /* stack pointer */ + if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) + goto done; + + /* eptp */ + eptp = EPTP(ept_pml4); + if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) + goto done; + + /* vpid */ + if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + goto done; + + /* msr bitmap */ + if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + goto done; + + /* exception bitmap */ + exc_bitmap = 1 << IDT_MC; + if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) + goto done; + + /* link pointer */ + if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) + goto done; +done: + VMCLEAR(vmcs); + return (error); +} + +uint64_t +vmcs_read(uint32_t encoding) +{ + int error; + uint64_t val; + + error = vmread(encoding, &val); + if (error != 0) + panic("vmcs_read(%u) error %d", encoding, error); + + return (val); +} diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h new file mode 100644 index 0000000..c633a59 --- /dev/null +++ b/sys/amd64/vmm/intel/vmcs.h @@ -0,0 +1,324 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMCS_H_ +#define _VMCS_H_ + +#ifdef _KERNEL +struct vmcs { + uint32_t identifier; + uint32_t abort_code; + char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +}; +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +/* MSR save region is composed of an array of 'struct msr_entry' */ +struct msr_entry { + uint32_t index; + uint32_t reserved; + uint64_t val; + +}; + +int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); +int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, + u_long ept_pml4, + uint32_t pinbased_ctls, uint32_t procbased_ctls, + uint32_t procbased_ctls2, uint32_t exit_ctls, + uint32_t entry_ctls, u_long msr_bitmap, + uint16_t vpid); +int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval); +int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val); +int vmcs_getdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +int vmcs_setdesc(struct vmcs *vmcs, int ident, + struct seg_desc *desc); +uint64_t vmcs_read(uint32_t encoding); + +#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) +#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) + +#endif /* _KERNEL */ + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. 
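+ *
+ * In each encoding, bits 14:13 give the field width (16-bit, 64-bit,
+ * 32-bit or natural-width), bits 11:10 the field type (control,
+ * read-only data, guest-state or host-state) and bit 0 selects the
+ * high half of a 64-bit field.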
+ */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_EPTP 0x0000201A + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404 +#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define 
VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define 
EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC 44 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 + +/* + * VMCS interrupt information fields + */ +#define VMCS_INTERRUPTION_INFO_VALID (1 << 31) +#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) +#define VMCS_INTERRUPTION_INFO_NMI (2 << 8) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +#endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c new file mode 100644 index 0000000..ec181c4 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.c @@ -0,0 +1,1673 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/psl.h> +#include <machine/cpufunc.h> +#include <machine/pmap.h> +#include <machine/segments.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_msr.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" + +#include "vmx_msr.h" +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "x86.h" +#include "vmx_controls.h" + +#define CR4_VMXE (1UL << 13) + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_SAVE_PAT | \ + VM_EXIT_LOAD_PAT | \ + VM_EXIT_LOAD_EFER) +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_PAT | \ + VM_ENTRY_LOAD_EFER) +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define HANDLED 1 +#define UNHANDLED 0 + +MALLOC_DEFINE(M_VMX, "vmx", "vmx"); + +extern struct pcpu __pcpu[]; + +static int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +static uint64_t cr4_ones_mask, cr4_zeros_mask; + +static volatile u_int nextvpid; + +/* + * Virtual NMI blocking conditions. + * + * Some processor implementations also require NMI to be blocked if + * the STI_BLOCKING bit is set. It is possible to detect this at runtime + * based on the (exit_reason,exit_qual) tuple being set to + * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). + * + * We take the easy way out and also include STI_BLOCKING as one of the + * gating items for vNMI injection. 
+ */ +static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | + VMCS_INTERRUPTIBILITY_NMI_BLOCKING | + VMCS_INTERRUPTIBILITY_STI_BLOCKING; + +/* + * Optional capabilities + */ +static int cap_halt_exit; +static int cap_pause_exit; +static int cap_unrestricted_guest; +static int cap_monitor_trap; + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt"); + +#ifdef KTR +static const char * +exit_reason_to_str(int reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case EXIT_REASON_EXCEPTION: + return "exception"; + case EXIT_REASON_EXT_INTR: + return "extint"; + case EXIT_REASON_TRIPLE_FAULT: + return "triplefault"; + case EXIT_REASON_INIT: + return "init"; + case EXIT_REASON_SIPI: + return "sipi"; + case EXIT_REASON_IO_SMI: + return "iosmi"; + case EXIT_REASON_SMI: + return "smi"; + case EXIT_REASON_INTR_WINDOW: + return "intrwindow"; + case EXIT_REASON_NMI_WINDOW: + return "nmiwindow"; + case EXIT_REASON_TASK_SWITCH: + return "taskswitch"; + case EXIT_REASON_CPUID: + return "cpuid"; + case EXIT_REASON_GETSEC: + return "getsec"; + case EXIT_REASON_HLT: + return "hlt"; + case EXIT_REASON_INVD: + return "invd"; + case EXIT_REASON_INVLPG: + return "invlpg"; + case EXIT_REASON_RDPMC: + return "rdpmc"; + case EXIT_REASON_RDTSC: + return "rdtsc"; + case EXIT_REASON_RSM: + return "rsm"; + case EXIT_REASON_VMCALL: + return "vmcall"; + case EXIT_REASON_VMCLEAR: + return "vmclear"; + case EXIT_REASON_VMLAUNCH: + return "vmlaunch"; + case EXIT_REASON_VMPTRLD: + return "vmptrld"; + case EXIT_REASON_VMPTRST: + return "vmptrst"; + case EXIT_REASON_VMREAD: + return "vmread"; + case EXIT_REASON_VMRESUME: + return "vmresume"; + case EXIT_REASON_VMWRITE: + return "vmwrite"; + case EXIT_REASON_VMXOFF: + return "vmxoff"; + case EXIT_REASON_VMXON: + return "vmxon"; + case EXIT_REASON_CR_ACCESS: + return "craccess"; + case EXIT_REASON_DR_ACCESS: + return "draccess"; + case EXIT_REASON_INOUT: + return "inout"; + case EXIT_REASON_RDMSR: + return "rdmsr"; + case EXIT_REASON_WRMSR: + return "wrmsr"; + case EXIT_REASON_INVAL_VMCS: + return "invalvmcs"; + case EXIT_REASON_INVAL_MSR: + return "invalmsr"; + case EXIT_REASON_MWAIT: + return "mwait"; + case EXIT_REASON_MTF: + return "mtf"; + case EXIT_REASON_MONITOR: + return "monitor"; + case EXIT_REASON_PAUSE: + return "pause"; + case EXIT_REASON_MCE: + return "mce"; + case EXIT_REASON_TPR: + return "tpr"; + case EXIT_REASON_APIC: + return "apic"; + case EXIT_REASON_GDTR_IDTR: + return "gdtridtr"; + case EXIT_REASON_LDTR_TR: + return "ldtrtr"; + case EXIT_REASON_EPT_FAULT: + return "eptfault"; + case EXIT_REASON_EPT_MISCONFIG: + return "eptmisconfig"; + case EXIT_REASON_INVEPT: + return "invept"; + case EXIT_REASON_RDTSCP: + return "rdtscp"; + case EXIT_REASON_VMX_PREEMPT: + return "vmxpreempt"; + case EXIT_REASON_INVVPID: + return "invvpid"; + case EXIT_REASON_WBINVD: + return "wbinvd"; + case EXIT_REASON_XSETBV: + return "xsetbv"; + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); + return (reasonbuf); + } +} + +#ifdef SETJMP_TRACE +static const char * +vmx_setjmp_rc2str(int rc) +{ + switch (rc) { + case VMX_RETURN_DIRECT: + return "direct"; + case VMX_RETURN_LONGJMP: + return "longjmp"; + case VMX_RETURN_VMRESUME: + return "vmresume"; + case VMX_RETURN_VMLAUNCH: + return "vmlaunch"; + default: + return "unknown"; + } +} + +#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ + VMM_CTR1((vmx)->vm, (vcpu), 
"setjmp trace " #regname " 0x%016lx", \ + (vmxctx)->regname) + +static void +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + uint64_t host_rip, host_rsp; + + if (vmxctx != &vmx->ctx[vcpu]) + panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", + vmxctx, &vmx->ctx[vcpu]); + + VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); + VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", + vmx_setjmp_rc2str(rc), rc); + + host_rsp = host_rip = ~0; + vmread(VMCS_HOST_RIP, &host_rip); + vmread(VMCS_HOST_RSP, &host_rsp); + VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", + host_rip, host_rsp); + + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); + + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); + SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); +} +#endif +#else +static void __inline +vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) +{ + return; +} +#endif /* KTR */ + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +msr_save_area_init(struct msr_entry *g_area, int *g_count) +{ + int cnt; + + static struct msr_entry guest_msrs[] = { + { MSR_KGSBASE, 0, 0 }, + }; + + cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); + if (cnt > GUEST_MSR_MAX_ENTRIES) + panic("guest msr save area overrun"); + bcopy(guest_msrs, g_area, sizeof(guest_msrs)); + *g_count = cnt; +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. 
+ */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static int +vmx_init(void) +{ + int error; + unsigned int regs[4]; + uint64_t fixed0, fixed1; + uint32_t tmp; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + do_cpuid(1, regs); + if ((regs[2] & CPUID_0000_0001_FEAT0_VMX) == 0) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, + VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + /* Initialize EPT */ + error = ept_init(); + if (error) { + 
printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. + */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + return (0); +} + +/* + * If this processor does not support VPIDs then simply return 0. + * + * Otherwise generate the next value of VPID to use. Any value is alright + * as long as it is non-zero. + * + * We always execute in VMX non-root context with EPT enabled. Thus all + * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This + * in turn means that multiple VMs can share the same VPID as long as + * they have distinct EPT page tables. + * + * XXX + * We should optimize this so that it returns VPIDs that are not in + * use. Then we will not unnecessarily invalidate mappings in + * vmx_set_pcpu_defaults() just because two or more vcpus happen to + * use the same 'vpid'. + */ +static uint16_t +vmx_vpid(void) +{ + uint16_t vpid = 0; + + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) { + do { + vpid = atomic_fetchadd_int(&nextvpid, 1); + } while (vpid == 0); + } + + return (vpid); +} + +static int +vmx_setup_cr0_shadow(struct vmcs *vmcs) +{ + int error; + uint64_t mask, shadow; + + mask = cr0_ones_mask | cr0_zeros_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_MASK), mask); + if (error) + return (error); + + shadow = cr0_ones_mask; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_CR0_SHADOW), shadow); + if (error) + return (error); + + return (0); +} + +static void * +vmx_vminit(struct vm *vm) +{ + uint16_t vpid; + int i, error, guest_msr_count; + struct vmx *vmx; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vtophys(vmx->pml4ept)); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * Guest KGSBASE is saved and restored in the guest MSR save area. + * Host KGSBASE is restored before returning to userland from the pcb. + * There will be a window of time when we are executing in the host + * kernel context with a value of KGSBASE from the guest. This is ok + * because the value of KGSBASE is inconsequential in kernel context. 
+ * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * MSR_PAT is saved and restored in the guest VMCS are on a VM exit + * and entry respectively. It is also restored from the host VMCS + * area on a VM exit. + */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_KGSBASE) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_rw(vmx, MSR_PAT)) + panic("vmx_vminit: error setting guest msr access"); + + for (i = 0; i < VM_MAXCPU; i++) { + vmx->vmcs[i].identifier = vmx_revision(); + error = vmclear(&vmx->vmcs[i]); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vpid = vmx_vpid(); + + error = vmcs_set_defaults(&vmx->vmcs[i], + (u_long)vmx_longjmp, + (u_long)&vmx->ctx[i], + vtophys(vmx->pml4ept), + pinbased_ctls, + procbased_ctls, + procbased_ctls2, + exit_ctls, entry_ctls, + vtophys(vmx->msr_bitmap), + vpid); + + if (error != 0) + panic("vmx_vminit: vmcs_set_defaults error %d", error); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + + vmx->state[i].request_nmi = 0; + vmx->state[i].lastcpu = -1; + vmx->state[i].vpid = vpid; + + msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + + error = vmcs_set_msr_save(&vmx->vmcs[i], + vtophys(vmx->guest_msrs[i]), + guest_msr_count); + if (error != 0) + panic("vmcs_set_msr_save error %d", error); + + error = vmx_setup_cr0_shadow(&vmx->vmcs[i]); + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid((uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); +#if 0 + printf("%s: func %x rax %lx rbx %lx rcx %lx rdx %lx handled %d\n", + __func__, func, vmxctx->guest_rax, vmxctx->guest_rbx, + vmxctx->guest_rcx, vmxctx->guest_rdx, handled); +#endif + + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled, int astpending) +{ +#ifdef KTR + VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? "handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); + + if (astpending) + VMM_CTR0(vmx->vm, vcpu, "astpending"); +#endif +} + +static int +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +{ + int error, lastcpu; + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc = { 0 }; + + vmxstate = &vmx->state[vcpu]; + lastcpu = vmxstate->lastcpu; + vmxstate->lastcpu = curcpu; + + if (lastcpu == curcpu) { + error = 0; + goto done; + } + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + error = vmwrite(VMCS_HOST_TR_BASE, (u_long)PCPU_GET(tssp)); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GDTR_BASE, (u_long)&gdt[NGDT * curcpu]); + if (error != 0) + goto done; + + error = vmwrite(VMCS_HOST_GS_BASE, (u_long)&__pcpu[curcpu]); + if (error != 0) + goto done; + + /* + * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. 
So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. + */ + if (vmxstate->vpid != 0) { + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } +done: + return (error); +} + +static void +vm_exit_update_rip(struct vm_exit *vmexit) +{ + int error; + + error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); + if (error) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_int_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + int error; + + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + + error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + if (error) + panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); +} + +static int +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + int error; + uint64_t info, interruptibility; + + /* Bail out if no NMI requested */ + if (vmx->state[vcpu].request_nmi == 0) + return (0); + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_nmi: vmread(interruptibility) %d", + error); + } + if (interruptibility & nmi_blocking_bits) + goto nmiblocked; + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; + info |= IDT_NMI; + + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); + + VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vmx->state[vcpu].request_nmi = 0; + return (1); + +nmiblocked: + /* + * Set the NMI Window Exiting execution control so we can inject + * the virtual NMI as soon as blocking condition goes away. 
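+ *
+ * The resulting EXIT_REASON_NMI_WINDOW exit is handled in
+ * vmx_exit_process() by clearing this control again; the still-pending
+ * request is then picked up on the next pass through
+ * vmx_inject_interrupts().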
+ */ + vmx_set_nmi_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + return (1); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu) +{ + int error, vector; + uint64_t info, rflags, interruptibility; + + const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; + +#if 1 + /* + * XXX + * If an event is being injected from userland then just return. + * For e.g. we may inject a breakpoint exception to cause the + * guest to enter the debugger so we can inspect its state. + */ + error = vmread(VMCS_ENTRY_INTR_INFO, &info); + if (error) + panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); + if (info & VMCS_INTERRUPTION_INFO_VALID) + return; +#endif + /* + * NMI injection has priority so deal with those first + */ + if (vmx_inject_nmi(vmx, vcpu)) + return; + + /* Ask the local apic for a vector to inject */ + vector = lapic_pending_intr(vmx->vm, vcpu); + if (vector < 0) + return; + + if (vector < 32 || vector > 255) + panic("vmx_inject_interrupts: invalid vector %d\n", vector); + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + error = vmread(VMCS_GUEST_RFLAGS, &rflags); + if (error) + panic("vmx_inject_interrupts: vmread(rflags) %d", error); + + if ((rflags & PSL_I) == 0) + goto cantinject; + + error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); + if (error) { + panic("vmx_inject_interrupts: vmread(interruptibility) %d", + error); + } + if (interruptibility & HWINTR_BLOCKED) + goto cantinject; + + /* Inject the interrupt */ + info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; + info |= vector; + error = vmwrite(VMCS_ENTRY_INTR_INFO, info); + if (error) + panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); + + /* Update the Local APIC ISR */ + lapic_intr_accepted(vmx->vm, vcpu, vector); + + VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); + + VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); +} + +static int +vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + int error; + uint64_t regval; + const struct vmxctx *vmxctx; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xff) != 0x00) + return (UNHANDLED); + + vmxctx = &vmx->ctx[vcpu]; + + /* + * We must use vmwrite() directly here because vmcs_setreg() will + * call vmclear(vmcs) as a side-effect which we certainly don't want. 
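+ *
+ * (As noted for vmx_getreg/vmx_setreg below, the vmclear would drop the
+ * launched state of the current VMCS and make the subsequent
+ * vmlaunch/vmresume fail.)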
+ */
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ regval |= cr0_ones_mask;
+ regval &= ~cr0_zeros_mask;
+ error = vmwrite(VMCS_GUEST_CR0, regval);
+ if (error)
+ panic("vmx_emulate_cr_access: error %d writing cr0", error);
+
+ return (HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ handled = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ }
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ handled = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ }
+ break;
+ case EXIT_REASON_HLT:
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ?
1 : 0; + vmexit->u.inout.port = (uint16_t)(qual >> 16); + vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); + break; + case EXIT_REASON_CPUID: + handled = vmx_handle_cpuid(vmxctx); + break; + default: + break; + } + + if (handled) { + /* + * It is possible that control is returned to userland + * even though we were able to handle the VM exit in the + * kernel (for e.g. 'astpending' is set in the run loop). + * + * In such a case we want to make sure that the userland + * restarts guest execution at the instruction *after* + * the one we just processed. Therefore we update the + * guest rip in the VMCS and in 'vmexit'. + */ + vm_exit_update_rip(vmexit); + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic VMX exit. + */ + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.error = 0; + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static int +vmx_run(void *arg, int vcpu, register_t rip, struct vm_exit *vmexit) +{ + int error, vie, rc, handled, astpending, loopstart; + uint32_t exit_reason; + struct vmx *vmx; + struct vmxctx *vmxctx; + struct vmcs *vmcs; + + vmx = arg; + vmcs = &vmx->vmcs[vcpu]; + vmxctx = &vmx->ctx[vcpu]; + loopstart = 1; + + /* + * XXX Can we avoid doing this every time we do a vm run? + */ + VMPTRLD(vmcs); + + /* + * XXX + * We do this every time because we may setup the virtual machine + * from a different process than the one that actually runs it. + * + * If the life of a virtual machine was spent entirely in the context + * of a single process we could do this once in vmcs_set_defaults(). + */ + if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) + panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); + + if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) + panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); + + if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) + panic("vmx_run: error %d setting up pcpu defaults", error); + + do { + lapic_timer_tick(vmx->vm, vcpu); + vmx_inject_interrupts(vmx, vcpu); + vmx_run_trace(vmx, vcpu); + rc = vmx_setjmp(vmxctx); +#ifdef SETJMP_TRACE + vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); +#endif + switch (rc) { + case VMX_RETURN_DIRECT: + if (loopstart) { + loopstart = 0; + vmx_launch(vmxctx); + } else + vmx_resume(vmxctx); + panic("vmx_launch/resume should not return"); + break; + case VMX_RETURN_LONGJMP: + break; /* vm exit */ + case VMX_RETURN_VMRESUME: + vie = vmcs_instruction_error(); + if (vmxctx->launch_error == VM_FAIL_INVALID || + vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { + printf("vmresume error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); + goto err_exit; + } + vmx_launch(vmxctx); /* try to launch the guest */ + panic("vmx_launch should not return"); + break; + case VMX_RETURN_VMLAUNCH: + vie = vmcs_instruction_error(); +#if 1 + printf("vmlaunch error %d vmcs inst error %d\n", + vmxctx->launch_error, vie); +#endif + goto err_exit; + default: + panic("vmx_setjmp returned %d", rc); + } + + /* + * XXX locking? + * See comments in exception.S about checking for ASTs + * atomically while interrupts are disabled. But it is + * not clear that they apply in our case. 
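+ *
+ * (In this loop an AST simply terminates the do/while so that we
+ * return to userland and let it be serviced before the guest is
+ * entered again.)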
+ */ + astpending = curthread->td_flags & TDF_ASTPENDING; + + /* enable interrupts */ + enable_intr(); + + /* collect some basic information for VM exit processing */ + vmexit->rip = rip = vmcs_guest_rip(); + vmexit->inst_length = vmexit_instruction_length(); + vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); + vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); + + handled = vmx_exit_process(vmx, vcpu, vmexit); + + vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled, + astpending); + } while (handled && !astpending); + + /* + * If a VM exit has been handled then the exitcode must be BOGUS + * If a VM exit is not handled then the exitcode must not be BOGUS + */ + if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || + (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { + panic("Mismatch between handled (%d) and exitcode (%d)", + handled, vmexit->exitcode); + } + + VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); + + /* + * XXX + * We need to do this to ensure that any VMCS state cached by the + * processor is flushed to memory. We need to do this in case the + * VM moves to a different cpu the next time it runs. + * + * Can we avoid doing this? + */ + VMCLEAR(vmcs); + return (0); + +err_exit: + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.exit_reason = (uint32_t)-1; + vmexit->u.vmx.exit_qualification = (uint32_t)-1; + vmexit->u.vmx.error = vie; + VMCLEAR(vmcs); + return (ENOEXEC); +} + +static void +vmx_vmcleanup(void *arg) +{ + int error; + struct vmx *vmx = arg; + + /* + * XXXSMP we also need to clear the VMCS active on the other vcpus. + */ + error = vmclear(&vmx->vmcs[0]); + if (error != 0) + panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + + ept_vmcleanup(vmx); + free(vmx, M_VMX); + + return; +} + +static register_t * +vmxctx_regptr(struct vmxctx *vmxctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RAX: + return (&vmxctx->guest_rax); + case VM_REG_GUEST_RBX: + return (&vmxctx->guest_rbx); + case VM_REG_GUEST_RCX: + return (&vmxctx->guest_rcx); + case VM_REG_GUEST_RDX: + return (&vmxctx->guest_rdx); + case VM_REG_GUEST_RSI: + return (&vmxctx->guest_rsi); + case VM_REG_GUEST_RDI: + return (&vmxctx->guest_rdi); + case VM_REG_GUEST_RBP: + return (&vmxctx->guest_rbp); + case VM_REG_GUEST_R8: + return (&vmxctx->guest_r8); + case VM_REG_GUEST_R9: + return (&vmxctx->guest_r9); + case VM_REG_GUEST_R10: + return (&vmxctx->guest_r10); + case VM_REG_GUEST_R11: + return (&vmxctx->guest_r11); + case VM_REG_GUEST_R12: + return (&vmxctx->guest_r12); + case VM_REG_GUEST_R13: + return (&vmxctx->guest_r13); + case VM_REG_GUEST_R14: + return (&vmxctx->guest_r14); + case VM_REG_GUEST_R15: + return (&vmxctx->guest_r15); + default: + break; + } + return (NULL); +} + +static int +vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *retval = *regp; + return (0); + } else + return (EINVAL); +} + +static int +vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) +{ + register_t *regp; + + if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { + *regp = val; + return (0); + } else + return (EINVAL); +} + +static int +vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) +{ + struct vmx *vmx = arg; + + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. 
+ * + * vmcs_getreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + + return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval)); +} + +static int +vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) +{ + int error; + uint64_t ctls; + struct vmx *vmx = arg; + + /* + * XXX Allow caller to set contents of the guest registers saved in + * the 'vmxctx' even though the vcpu might be running. We need this + * specifically to support the rdmsr emulation that will set the + * %eax and %edx registers during vm exit processing. + */ + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) + return (0); + + /* + * If the vcpu is running then don't mess with the VMCS. + * + * vmcs_setreg will VMCLEAR the vmcs when it is done which will cause + * the subsequent vmlaunch/vmresume to fail. + */ + if (vcpu_is_running(vmx->vm, vcpu, NULL)) + panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + + error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val); + + if (error == 0) { + /* + * If the "load EFER" VM-entry control is 1 then the + * value of EFER.LMA must be identical to "IA-32e mode guest" + * bit in the VM-entry control. + */ + if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && + (reg == VM_REG_GUEST_EFER)) { + vmcs_getreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); + if (val & EFER_LMA) + ctls |= VM_ENTRY_GUEST_LMA; + else + ctls &= ~VM_ENTRY_GUEST_LMA; + vmcs_setreg(&vmx->vmcs[vcpu], + VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); + } + } + + return (error); +} + +static int +vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmx *vmx = arg; + + return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); +} + +static int +vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, + int code_valid) +{ + int error; + uint32_t info; + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + static uint32_t type_map[VM_EVENT_MAX] = { + 0x1, /* VM_EVENT_NONE */ + 0x0, /* VM_HW_INTR */ + 0x2, /* VM_NMI */ + 0x3, /* VM_HW_EXCEPTION */ + 0x4, /* VM_SW_INTR */ + 0x5, /* VM_PRIV_SW_EXCEPTION */ + 0x6, /* VM_SW_EXCEPTION */ + }; + + info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); + info |= VMCS_INTERRUPTION_INFO_VALID; + error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); + if (error != 0) + return (error); + + if (code_valid) { + error = vmcs_setreg(vmcs, + VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), + code); + } + return (error); +} + +static int +vmx_nmi(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + + atomic_set_int(&vmx->state[vcpu].request_nmi, 1); + + return (0); +} + +static int +vmx_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct vmx *vmx = arg; + int vcap; + int ret; + + ret = ENOENT; + + vcap = vmx->cap[vcpu].set; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) + ret = 0; + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) + ret = 0; + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) + ret = 0; + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 
1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + baseval = procbased_ctls2; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + return (retval); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + ept_vmmmap, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_inject, + vmx_nmi, + vmx_getcap, + vmx_setcap +}; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h new file mode 100644 index 0000000..69697f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx.h @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + register_t host_rip; + /* + * XXX todo debug registers and fpu state + */ + + int launch_error; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; +}; + +struct vmxstate { + int request_nmi; + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +/* virtual machine softc */ +struct vmx { + pml4_entry_t pml4ept[NPML4EPG]; + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + char msr_bitmap[PAGE_SIZE]; + struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; + struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + struct vm *vm; +}; +CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); + +#define VMX_RETURN_DIRECT 0 +#define VMX_RETURN_LONGJMP 1 +#define VMX_RETURN_VMRESUME 2 +#define VMX_RETURN_VMLAUNCH 3 +/* + * vmx_setjmp() returns: + * - 0 when it returns directly + * - 1 when it returns from vmx_longjmp + * - 2 when it returns from vmx_resume (which would only be in the error case) + * - 3 when it returns from vmx_launch (which would only be in the error case) + */ +int vmx_setjmp(struct vmxctx *ctx); +void vmx_longjmp(void); /* returns via vmx_setjmp */ +void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h new file mode 100644 index 0000000..31f29f8 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1 << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000..e9f6c6d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,199 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE(varname) \ + do { \ + __asm __volatile(" jnc 1f;" \ + " mov $1, %0;" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %0;" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %0;" \ + "3: nop" \ + :"=r" (varname)); \ + } while (0) + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline void +vmxoff(void) +{ + __asm __volatile("vmxoff"); +} + +static __inline void +vmptrst(uint64_t *addr) +{ + __asm __volatile("vmptrst %0" : : "m" (*addr) : "memory"); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmptrld %0" : : "m" (*(uint64_t *)&addr) : "memory"); + VMX_SET_ERROR_CODE(error); + return (error); +} + +static __inline int +vmwrite(uint64_t reg, uint64_t val) +{ + int error; + + __asm __volatile("vmwrite %0, %1" : : "r" (val), "r" (reg) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static __inline int +vmread(uint64_t r, uint64_t *addr) +{ + int error; + + __asm __volatile("vmread %0, %1" : : "r" (r), "m" (*addr) : "memory"); + + VMX_SET_ERROR_CODE(error); + + return (error); +} + +static void __inline +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static void __inline +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} + +#define INVVPID_TYPE_ADDRESS 0UL +#define INVVPID_TYPE_SINGLE_CONTEXT 1UL +#define INVVPID_TYPE_ALL_CONTEXTS 2UL + +struct invvpid_desc { + uint16_t vpid; + uint16_t 
_res1; + uint32_t _res2; + uint64_t linear_addr; +}; +CTASSERT(sizeof(struct invvpid_desc) == 16); + +static void __inline +invvpid(uint64_t type, struct invvpid_desc desc) +{ + int error; + + __asm __volatile("invvpid %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invvpid error %d", error); +} + +#define INVEPT_TYPE_SINGLE_CONTEXT 1UL +#define INVEPT_TYPE_ALL_CONTEXTS 2UL +struct invept_desc { + uint64_t eptp; + uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %0, %1" :: "m" (desc), "r" (type) : "memory"); + + VMX_SET_ERROR_CODE(error); + if (error) + panic("invept error %d", error); +} +#endif diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c new file mode 100644 index 0000000..c4b1efc --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/assym.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/pmap.h> + +#include <machine/vmm.h> +#include "vmx.h" +#include "vmx_cpufunc.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); +ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); + +ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error)); + +ASSYM(VM_SUCCESS, VM_SUCCESS); +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); + +ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT); +ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP); +ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME); +ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH); diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c new file mode 100644 index 0000000..1e9a837 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.c @@ -0,0 +1,172 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> + +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. + */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr >= 0x00000000 && msr <= 0x00001FFF) + byte = msr / 8; + else if (msr 
>= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 + (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h new file mode 100644 index 0000000..e6379a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_msr.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +#define MSR_VMX_BASIC 0x480 +#define MSR_VMX_EPT_VPID_CAP 0x48C + +#define MSR_VMX_PROCBASED_CTLS 0x482 +#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E + +#define MSR_VMX_PINBASED_CTLS 0x481 +#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D + +#define MSR_VMX_PROCBASED_CTLS2 0x48B + +#define MSR_VMX_EXIT_CTLS 0x483 +#define MSR_VMX_TRUE_EXIT_CTLS 0x48f + +#define MSR_VMX_ENTRY_CTLS 0x484 +#define MSR_VMX_TRUE_ENTRY_CTLS 0x490 + +#define MSR_VMX_CR0_FIXED0 0x486 +#define MSR_VMX_CR0_FIXED1 0x487 + +#define MSR_VMX_CR4_FIXED0 0x488 +#define MSR_VMX_CR4_FIXED1 0x489 + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. 
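+ *
+ * As a layout reminder for msr_bitmap_change_access(): read-intercept
+ * bits for MSRs 0x00000000-0x00001FFF occupy bytes 0-1023 of the page,
+ * those for MSRs 0xC0000000-0xC0001FFF occupy bytes 1024-2047, and the
+ * corresponding write-intercept bits sit 2048 bytes further in. A clear
+ * bit grants the guest direct access; msr_bitmap_initialize() starts
+ * with every bit set, i.e. all accesses intercepted.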
+ */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#endif diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S new file mode 100644 index 0000000..4d1bf1d --- /dev/null +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -0,0 +1,204 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <machine/asmacros.h> + +#include "vmx_assym.s" + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx' + */ +#define VMX_GUEST_RESTORE \ + /* \ + * Make sure that interrupts are disabled before restoring CR2. \ + * Otherwise there could be a page fault during the interrupt \ + * handler execution that would end up trashing CR2. \ + */ \ + cli; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +#define VM_INSTRUCTION_ERROR(reg) \ + jnc 1f; \ + movl $VM_FAIL_INVALID,reg; /* CF is set */ \ + jmp 3f; \ +1: jnz 2f; \ + movl $VM_FAIL_VALID,reg; /* ZF is set */ \ + jmp 3f; \ +2: movl $VM_SUCCESS,reg; \ +3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) + + .text +/* + * int vmx_setjmp(ctxp) + * %rdi = ctxp + * + * Return value is '0' when it returns directly from here. + * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
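For illustration (not part of this change), the dispatch a caller is expected to build around these routines, inferred from the VMX_RETURN_* values used below; handle_vmexit() is a placeholder name:

	int rc;

	rc = vmx_setjmp(vmxctx);
	switch (rc) {
	case VMX_RETURN_DIRECT:
		vmx_launch(vmxctx);	/* first entry; returns only via vmx_setjmp() */
		break;
	case VMX_RETURN_LONGJMP:
		handle_vmexit(vmxctx);	/* a VM-exit came back through vmx_longjmp() */
		vmx_resume(vmxctx);	/* returns only via vmx_setjmp() */
		break;
	case VMX_RETURN_VMLAUNCH:
	case VMX_RETURN_VMRESUME:
		/* vmlaunch/vmresume itself failed; the error is saved in the vmxctx. */
		break;
	}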
+ */ +ENTRY(vmx_setjmp) + movq (%rsp),%rax /* return address */ + movq %r15,VMXCTX_HOST_R15(%rdi) + movq %r14,VMXCTX_HOST_R14(%rdi) + movq %r13,VMXCTX_HOST_R13(%rdi) + movq %r12,VMXCTX_HOST_R12(%rdi) + movq %rbp,VMXCTX_HOST_RBP(%rdi) + movq %rsp,VMXCTX_HOST_RSP(%rdi) + movq %rbx,VMXCTX_HOST_RBX(%rdi) + movq %rax,VMXCTX_HOST_RIP(%rdi) + + /* + * XXX save host debug registers + */ + movl $VMX_RETURN_DIRECT,%eax + ret +END(vmx_setjmp) + +/* + * void vmx_return(struct vmxctx *ctxp, int retval) + * %rdi = ctxp + * %rsi = retval + * Return to vmm context through vmx_setjmp() with a value of 'retval'. + */ +ENTRY(vmx_return) + /* Restore host context. */ + movq VMXCTX_HOST_R15(%rdi),%r15 + movq VMXCTX_HOST_R14(%rdi),%r14 + movq VMXCTX_HOST_R13(%rdi),%r13 + movq VMXCTX_HOST_R12(%rdi),%r12 + movq VMXCTX_HOST_RBP(%rdi),%rbp + movq VMXCTX_HOST_RSP(%rdi),%rsp + movq VMXCTX_HOST_RBX(%rdi),%rbx + movq VMXCTX_HOST_RIP(%rdi),%rax + movq %rax,(%rsp) /* return address */ + + /* + * XXX restore host debug registers + */ + movl %esi,%eax + ret +END(vmx_return) + +/* + * void vmx_longjmp(void) + * %rsp points to the struct vmxctx + */ +ENTRY(vmx_longjmp) + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + movq $VMX_RETURN_LONGJMP,%rsi + callq vmx_return +END(vmx_longjmp) + +/* + * void vmx_resume(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 2. + */ +ENTRY(vmx_resume) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmresume + + /* + * Capture the reason why vmresume failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ + movq %rsp,%rdi + movq $VMX_RETURN_VMRESUME,%rsi + callq vmx_return +END(vmx_resume) + +/* + * void vmx_launch(struct vmxctx *ctxp) + * %rdi = ctxp + * + * Although the return type is a 'void' this function may return indirectly + * through vmx_setjmp() with a return value of 3. + */ +ENTRY(vmx_launch) + /* + * Restore guest state that is not automatically loaded from the vmcs. + */ + VMX_GUEST_RESTORE + + vmlaunch + + /* + * Capture the reason why vmlaunch failed. + */ + VM_INSTRUCTION_ERROR(%eax) + + /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ + movq %rsp,%rdi + movq $VMX_RETURN_VMLAUNCH,%rsi + callq vmx_return +END(vmx_launch) diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c new file mode 100644 index 0000000..24495a9 --- /dev/null +++ b/sys/amd64/vmm/intel/vtd.c @@ -0,0 +1,637 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcireg.h> + +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. + */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1 << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1 << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t 
maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +/* + * Config space register definitions from the "Intel 5520 and 5500" datasheet. + */ +static int +tylersburg_vtd_ident(void) +{ + int units, nlbus; + uint16_t did, vid; + uint32_t miscsts, vtbar; + + const int bus = 0; + const int slot = 20; + const int func = 0; + + units = 0; + + vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2); + did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2); + if (vid != 0x8086 || did != 0x342E) + goto done; + + /* + * Check if this is a dual IOH configuration. + */ + miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4); + if (miscsts & (1 << 25)) + nlbus = pci_cfgregread(bus, slot, func, 0x160, 1); + else + nlbus = -1; + + vtbar = pci_cfgregread(bus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in legacy IOH is disabled!\n"); + + if (nlbus != -1) { + vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4); + if (vtbar & 0x1) { + vtdmaps[units++] = (struct vtdmap *) + PHYS_TO_DMAP(vtbar & 0xffffe000); + } else if (bootverbose) + printf("VT-d unit in non-legacy IOH is disabled!\n"); + } +done: + return (units); +} + +static drhd_ident_func_t drhd_ident_funcs[] = { + tylersburg_vtd_ident, + NULL +}; + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + 
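For illustration (not part of this change): vtd_wbflush(), vtd_translation_enable() and vtd_translation_disable() all use the same global command protocol, namely write a command value to GCR and spin until the matching GSR status bit reflects it. A generic helper capturing the pattern would look roughly like this; like the originals it spins without a timeout:

	static void
	vtd_gcr_wait(struct vtdmap *vtdmap, uint32_t cmd, uint32_t status, int set)
	{

		vtdmap->gcr = cmd;
		if (set) {
			while ((vtdmap->gsr & status) == 0)
				;	/* wait for the status bit to be set */
		} else {
			while ((vtdmap->gsr & status) != 0)
				;	/* wait for the status bit to clear */
		}
	}

For example, vtd_gcr_wait(vtdmap, VTD_GCR_TE, VTD_GSR_TES, 1) is equivalent to vtd_translation_enable() above.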
+static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + + for (i = 0; drhd_ident_funcs[i] != NULL; i++) { + units = (*drhd_ident_funcs[i])(); + if (units > 0) + break; + } + + if (units <= 0) + return (ENXIO); + + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, int bus, int slot, int func) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + vtdmap = vtdmaps[0]; + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = (slot << 3 | func) * 2; + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %d/%d/%d is already owned by " + "domain %d", bus, slot, func, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, int bus, int slot, int func) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + + if (bus < 0 || bus > PCI_BUSMAX || + slot < 0 || slot > PCI_SLOTMAX || + func < 0 || func > PCI_FUNCMAX) + panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func); + + ctxp = ctx_tables[bus]; + idx = (slot << 3 | func) * 2; + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. 
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accomodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Create a 'gpa' -> 'hpa' mapping + */ + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + + return (1UL << ptpshift); +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. 
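As a worked example of the width calculation above (illustrative, not part of the change): for a guest with maxaddr = 4GB the loop leaves gaw = 33, because gaw is incremented once more after addr reaches maxaddr, so res = (33 - 12) % 9 = 3 and agaw = 33 + 9 - 3 = 39. The scan below then settles on a 3-level, 39-bit page table, provided the hardware advertises the 39-bit width in its SAGAW field.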
+ */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_add_device, + vtd_remove_device, +}; diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c new file mode 100644 index 0000000..baf2447 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.c @@ -0,0 +1,230 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/bus.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" +#include "iommu.h" + +static boolean_t iommu_avail; +static struct iommu_ops *ops; +static void *host_domain; + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, bus, slot, func); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + const char *name; + device_t dev; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = TRUE; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = ptoa(Maxmem); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) + panic("iommu_init: unable to create a host domain"); + + /* + * Create 1:1 mappings from '0' to 'Maxmem' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* skip passthrough devices */ + name = device_get_name(dev); + if (name != NULL && strcmp(name, "ppt") == 0) + continue; + + /* everything else belongs to the host domain */ + iommu_add_device(host_domain, bus, slot, func); + } + } + } + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_add_device(void *dom, int bus, int slot, int func) +{ + + 
IOMMU_ADD_DEVICE(dom, bus, slot, func); +} + +void +iommu_remove_device(void *dom, int bus, int slot, int func) +{ + + IOMMU_REMOVE_DEVICE(dom, bus, slot, func); +} diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h new file mode 100644 index 0000000..e4f7229 --- /dev/null +++ b/sys/amd64/vmm/io/iommu.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func); +typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_init(void); +void iommu_cleanup(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_add_device(void *dom, int bus, int slot, int func); +void iommu_remove_device(void *dom, int bus, int slot, int func); +#endif diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c new file mode 100644 index 0000000..dc2f326 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.c @@ -0,0 +1,449 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/pciio.h> +#include <sys/rman.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> + +#include <machine/vmm.h> +#include <machine/vmm_dev.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) +#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1) +#define MAX_MSIMSGS 32 + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + int msg; +}; + +static struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + int vector; + int vcpu; + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; +} pptdevs[32]; + +static int num_pptdevs; + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if (vmm_is_pptdev(bus, slot, func) && + (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + return (0); + else + return (ENXIO); +} + +static int +ppt_attach(device_t dev) +{ + int n; + + if (num_pptdevs >= MAX_PPTDEVS) { + printf("ppt_attach: maximum number of pci passthrough devices " + "exceeded\n"); + return (ENXIO); + } + + n = num_pptdevs++; + pptdevs[n].dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + /* + * XXX check whether there are any pci passthrough devices assigned + * to guests before we allow this driver to detach. 
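A minimal sketch of the check this refers to, based only on the pptdevs[] state kept in this file (illustrative, not part of the change):

	int i;

	for (i = 0; i < num_pptdevs; i++) {
		if (pptdevs[i].dev == dev && pptdevs[i].vm != NULL)
			return (EBUSY);	/* still assigned to a guest */
	}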
+ */ + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +static devclass_t ppt_devclass; +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); + +static struct pptdev * +ppt_find(int bus, int slot, int func) +{ + device_t dev; + int i, b, s, f; + + for (i = 0; i < num_pptdevs; i++) { + dev = pptdevs[i].dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + return (&pptdevs[i]); + } + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct vm_memory_segment *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct vm_memory_segment)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is owned by a different VM then we + * cannot change its owner. + */ + if (ppt->vm != NULL && ppt->vm != vm) + return (EBUSY); + + ppt->vm = vm; + iommu_add_device(vm_iommu_domain(vm), bus, slot, func); + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is not owned by this 'vm' then bail out. 
+ */ + if (ppt->vm != vm) + return (EBUSY); + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + iommu_remove_device(vm_iommu_domain(vm), bus, slot, func); + ppt->vm = NULL; + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_all(struct vm *vm) +{ + int i, bus, slot, func; + device_t dev; + + for (i = 0; i < num_pptdevs; i++) { + if (pptdevs[i].vm == vm) { + dev = pptdevs[i].dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + ppt_unassign_device(vm, bus, slot, func); + } + } + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct vm_memory_segment *seg; + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + if (ppt->vm != vm) + return (EBUSY); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + seg->hpa = hpa; + } + return (error); + } + } + return (ENOSPC); + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + int vec; + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + vec = ppt->msi.vector + pptarg->msg; + + if (ppt->vm != NULL) + (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. + */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +/* + * XXX + * When we try to free the MSI resource the kernel will bind the thread to + * the host cpu was originally handling the MSI. The function freeing the + * MSI vector (apic_free_vector()) will panic the kernel if the thread + * is already bound to a cpu. + * + * So, we temporarily unbind the vcpu thread before freeing the MSI resource. + */ +static void +PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt) +{ + int pincpu = -1; + + vm_get_pinning(vm, vcpu, &pincpu); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, -1); + + ppt_teardown_msi(ppt); + + if (pincpu >= 0) + vm_set_pinning(vm, vcpu, pincpu); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if ((destcpu >= VM_MAXCPU || destcpu < 0) || + (vector < 0 || vector > 255) || + (numvec < 0 || numvec > MAX_MSIMSGS)) + return (EINVAL); + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + /* Free any allocated resources */ + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. 
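For illustration (not part of this change), one possible calling sequence for ppt_setup_msi(); the bus/slot/func, vector and message count are made up, and the teardown relies on the numvec == 0 early return above:

	/* Route 2 MSI messages from device 1/0/0 to vcpu 0 at guest vector 64. */
	error = ppt_setup_msi(vm, 0 /* vcpu */, 1, 0, 0,
			      0 /* destcpu */, 64 /* vector */, 2 /* numvec */);

	/* Later: release the vectors again by asking for zero messages. */
	if (error == 0)
		error = ppt_setup_msi(vm, 0, 1, 0, 0, 0, 64, 0);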
+ */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.vector = vector; + ppt->msi.vcpu = destcpu; + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. + */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].msg = i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE | INTR_FAST, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + PPT_TEARDOWN_MSI(vm, vcpu, ppt); + return (ENXIO); + } + + return (0); +} diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h new file mode 100644 index 0000000..95f3ad0 --- /dev/null +++ b/sys/amd64/vmm/io/ppt.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + int destcpu, int vector, int numvec); + +#endif diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c new file mode 100644 index 0000000..cd6c5d1 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.c @@ -0,0 +1,270 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include "vdev.h" + +struct vdev { + SLIST_ENTRY(vdev) entry; + struct vdev_ops *ops; + void *dev; +}; +static SLIST_HEAD(, vdev) vdev_head; +static int vdev_count; + +struct vdev_region { + SLIST_ENTRY(vdev_region) entry; + struct vdev_ops *ops; + void *dev; + struct io_region *io; +}; +static SLIST_HEAD(, vdev_region) region_head; +static int region_count; + +static MALLOC_DEFINE(M_VDEV, "vdev", "vdev"); + +#define VDEV_INIT (0) +#define VDEV_RESET (1) +#define VDEV_HALT (2) + +// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"}; + +static int +vdev_system_event(int event) +{ + struct vdev *vd; + int rc; + + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name); + switch (event) { + case VDEV_INIT: + rc = vd->ops->init(vd->dev); + break; + case VDEV_RESET: + rc = vd->ops->reset(vd->dev); + break; + case VDEV_HALT: + rc = vd->ops->halt(vd->dev); + break; + default: + break; + } + if (rc) { + printf("vdev %s init failed rc=%d\n", + vd->ops->name, rc); + return rc; + } + } + return 0; +} + +int +vdev_init(void) +{ + return vdev_system_event(VDEV_INIT); +} + +int +vdev_reset(void) +{ + return vdev_system_event(VDEV_RESET); +} + +int +vdev_halt(void) +{ + return vdev_system_event(VDEV_HALT); +} + +void +vdev_vm_init(void) +{ + SLIST_INIT(&vdev_head); + vdev_count = 0; + + SLIST_INIT(®ion_head); + region_count = 0; +} +void +vdev_vm_cleanup(void) +{ + struct vdev *vd; + + // TODO: locking + while (!SLIST_EMPTY(&vdev_head)) { + vd = SLIST_FIRST(&vdev_head); + SLIST_REMOVE_HEAD(&vdev_head, entry); + free(vd, M_VDEV); + vdev_count--; + } +} + +int +vdev_register(struct vdev_ops *ops, void *dev) +{ + struct vdev *vd; + vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO); + vd->ops = ops; + vd->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(&vdev_head, vd, entry); + vdev_count++; + return 0; +} + +void +vdev_unregister(void *dev) +{ + struct vdev *vd, *found; + + found = NULL; + // TODO: locking + SLIST_FOREACH(vd, &vdev_head, entry) { + if (vd->dev == dev) { + found = vd; + } + } + + if (found) { + SLIST_REMOVE(&vdev_head, found, vdev, entry); + free(found, M_VDEV); + } +} + +#define IN_RANGE(val, start, end) \ + (((val) >= (start)) && ((val) < (end))) + +static struct vdev_region* +vdev_find_region(struct io_region *io, void *dev) +{ + struct 
vdev_region *region, *found; + uint64_t region_base; + uint64_t region_end; + + found = NULL; + + // TODO: locking + // FIXME: we should verify we are in the context the current + // vcpu here as well. + SLIST_FOREACH(region, ®ion_head, entry) { + region_base = region->io->base; + region_end = region_base + region->io->len; + if (IN_RANGE(io->base, region_base, region_end) && + IN_RANGE(io->base+io->len, region_base, region_end+1) && + (dev && dev == region->dev)) { + found = region; + break; + } + } + return found; +} + +int +vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + if (region) { + return -EEXIST; + } + + region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO); + region->io = io; + region->ops = ops; + region->dev = dev; + + // TODO: locking + SLIST_INSERT_HEAD(®ion_head, region, entry); + region_count++; + + return 0; +} + +void +vdev_unregister_region(void *dev, struct io_region *io) +{ + struct vdev_region *region; + + region = vdev_find_region(io, dev); + + if (region) { + SLIST_REMOVE(®ion_head, region, vdev_region, entry); + free(region, M_VDEV); + region_count--; + } +} + +static int +vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read) +{ + struct vdev_region *region; + struct io_region io; + region_attr_t attr; + int rc; + + io.base = gpa; + io.len = size; + + region = vdev_find_region(&io, NULL); + if (!region) + return -EINVAL; + + attr = (read) ? MMIO_READ : MMIO_WRITE; + if (!(region->io->attr & attr)) + return -EPERM; + + if (read) + rc = region->ops->memread(region->dev, gpa, size, data); + else + rc = region->ops->memwrite(region->dev, gpa, size, *data); + + return rc; +} + +int +vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data) +{ + return vdev_memrw(gpa, size, data, 1); +} + +int +vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data) +{ + return vdev_memrw(gpa, size, &data, 0); +} diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h new file mode 100644 index 0000000..6feeba8 --- /dev/null +++ b/sys/amd64/vmm/io/vdev.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VDEV_H_ +#define _VDEV_H_ + +typedef enum { + BYTE = 1, + WORD = 2, + DWORD = 4, + QWORD = 8, +} opsize_t; + +typedef enum { + MMIO_READ = 1, + MMIO_WRITE = 2, +} region_attr_t; + +struct io_region { + uint64_t base; + uint64_t len; + region_attr_t attr; + int vcpu; +}; + +typedef int (*vdev_init_t)(void* dev); +typedef int (*vdev_reset_t)(void* dev); +typedef int (*vdev_halt_t)(void* dev); +typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data); +typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data); + + +struct vdev_ops { + const char *name; + vdev_init_t init; + vdev_reset_t reset; + vdev_halt_t halt; + vdev_memread_t memread; + vdev_memwrite_t memwrite; +}; + + +void vdev_vm_init(void); +void vdev_vm_cleanup(void); + +int vdev_register(struct vdev_ops *ops, void *dev); +void vdev_unregister(void *dev); + +int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io); +void vdev_unregister_region(void *dev, struct io_region *io); + +int vdev_init(void); +int vdev_reset(void); +int vdev_halt(void); +int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data); +int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data); + +#endif /* _VDEV_H_ */ + diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c new file mode 100644 index 0000000..a21addf --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.c @@ -0,0 +1,812 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/clock.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> + +#include "vmm_lapic.h" +#include "vmm_ktr.h" +#include "vdev.h" +#include "vlapic.h" + +#define VLAPIC_CTR0(vlapic, format) \ + VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic.irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic.isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +#define PRIO(x) ((x) >> 4) + +#define VLAPIC_VERSION (16) +#define VLAPIC_MAXLVT_ENTRIES (5) + +struct vlapic { + struct vm *vm; + int vcpuid; + + struct io_region *mmio; + struct vdev_ops *ops; + struct LAPIC apic; + + int esr_update; + + int divisor; + int ccr_ticks; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. 
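A worked example of the priority rule implemented by vlapic_update_ppr() later in this file (illustrative, not part of the change): with TPR = 0x20 and vector 0x35 on top of this stack, PRIO(0x20) = 2 is lower than PRIO(0x35) = 3, so PPR becomes 0x35 & 0xf0 = 0x30, and vlapic_pending_intr() will then report only pending vectors of priority class 4 or higher (vector 0x40 and up).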
+ */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; +}; + +static void +vlapic_mask_lvts(uint32_t *lvts, int num_lvt) +{ + int i; + for (i = 0; i < num_lvt; i++) { + *lvts |= APIC_LVT_M; + lvts += 4; + } +} + +#if 0 +static inline void +vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) +{ + printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, + *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, + *lvt & APIC_LVTT_M); +} +#endif + +static uint64_t +vlapic_get_ccr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + return lapic->ccr_timer; +} + +static void +vlapic_update_errors(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->esr = 0; // XXX +} + +static void +vlapic_init_ipi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1); +} + +static int +vlapic_op_reset(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + + memset(lapic, 0, sizeof(*lapic)); + lapic->id = vlapic->vcpuid << 24; + lapic->apr = vlapic->vcpuid; + vlapic_init_ipi(vlapic); + + return 0; + +} + +static int +vlapic_op_init(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_register_region(vlapic->ops, vlapic, vlapic->mmio); + return vlapic_op_reset(dev); +} + +static int +vlapic_op_halt(void* dev) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + vdev_unregister_region(vlapic, vlapic->mmio); + return 0; + +} + +void +vlapic_set_intr_ready(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr; + int idx; + + if (vector < 0 || vector >= 256) + panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + + idx = (vector / 32) * 4; + irrptr = &lapic->irr0; + atomic_set_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); +} + +#define VLAPIC_BUS_FREQ tsc_freq +#define VLAPIC_DCR(x) ((x->dcr_timer & 0x8) >> 1)|(x->dcr_timer & 0x3) + +static int +vlapic_timer_divisor(uint32_t dcr) +{ + switch (dcr & 0xB) { + case APIC_TDCR_2: + return (2); + case APIC_TDCR_4: + return (4); + case APIC_TDCR_8: + return (8); + case APIC_TDCR_16: + return (16); + case APIC_TDCR_32: + return (32); + case APIC_TDCR_64: + return (64); + case APIC_TDCR_128: + return (128); + default: + panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); + } +} + +static void +vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) +{ + uint32_t icr_timer; + + icr_timer = vlapic->apic.icr_timer; + + vlapic->ccr_ticks = ticks; + if (elapsed < icr_timer) + vlapic->apic.ccr_timer = icr_timer - elapsed; + else { + /* + * This can happen when the guest is trying to run its local + * apic timer higher that the setting of 'hz' in the host. + * + * We deal with this by running the guest local apic timer + * at the rate of the host's 'hz' setting. 
+ */ + vlapic->apic.ccr_timer = 0; + } +} + +static __inline uint32_t * +vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) +{ + struct LAPIC *lapic = &vlapic->apic; + int i; + + if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) { + panic("vlapic_get_lvt: invalid LVT\n"); + } + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; +} + +#if 1 +static void +dump_isrvec_stk(struct vlapic *vlapic) +{ + int i; + uint32_t *isrptr; + + isrptr = &vlapic->apic.isr0; + for (i = 0; i < 8; i++) + printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); + + for (i = 0; i <= vlapic->isrvec_stk_top; i++) + printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); +} +#endif + +/* + * Algorithm adopted from section "Interrupt, Task and Processor Priority" + * in Intel Architecture Manual Vol 3a. + */ +static void +vlapic_update_ppr(struct vlapic *vlapic) +{ + int isrvec, tpr, ppr; + + /* + * Note that the value on the stack at index 0 is always 0. + * + * This is a placeholder for the value of ISRV when none of the + * bits is set in the ISRx registers. + */ + isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; + tpr = vlapic->apic.tpr; + +#if 1 + { + int i, lastprio, curprio, vector, idx; + uint32_t *isrptr; + + if (vlapic->isrvec_stk_top == 0 && isrvec != 0) + panic("isrvec_stk is corrupted: %d", isrvec); + + /* + * Make sure that the priority of the nested interrupts is + * always increasing. + */ + lastprio = -1; + for (i = 1; i <= vlapic->isrvec_stk_top; i++) { + curprio = PRIO(vlapic->isrvec_stk[i]); + if (curprio <= lastprio) { + dump_isrvec_stk(vlapic); + panic("isrvec_stk does not satisfy invariant"); + } + lastprio = curprio; + } + + /* + * Make sure that each bit set in the ISRx registers has a + * corresponding entry on the isrvec stack. + */ + i = 1; + isrptr = &vlapic->apic.isr0; + for (vector = 0; vector < 256; vector++) { + idx = (vector / 32) * 4; + if (isrptr[idx] & (1 << (vector % 32))) { + if (i > vlapic->isrvec_stk_top || + vlapic->isrvec_stk[i] != vector) { + dump_isrvec_stk(vlapic); + panic("ISR and isrvec_stk out of sync"); + } + i++; + } + } + } +#endif + + if (PRIO(tpr) >= PRIO(isrvec)) + ppr = tpr; + else + ppr = isrvec & 0xf0; + + vlapic->apic.ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + +static void +vlapic_process_eoi(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *isrptr; + int i, idx, bitpos; + + isrptr = &lapic->isr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. 
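+ *
+ * A stand-alone sketch of the scan that follows (illustration only,
+ * using a hypothetical helper and a compacted uint32_t isr[8] view
+ * instead of the 16-byte-strided words in struct LAPIC): the highest
+ * vector currently in service is found by searching from the top word
+ * down with fls(9).
+ *
+ *	static int
+ *	highest_isr_vector(const uint32_t isr[8])
+ *	{
+ *		int i, bitpos;
+ *
+ *		for (i = 7; i > 0; i--) {
+ *			bitpos = fls(isr[i]);
+ *			if (bitpos != 0)
+ *				return (i * 32 + bitpos - 1);
+ *		}
+ *		return (-1);
+ *	}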
+ */ + for (i = 7; i > 0; i--) { + idx = i * 4; + bitpos = fls(isrptr[idx]); + if (bitpos != 0) { + if (vlapic->isrvec_stk_top <= 0) { + panic("invalid vlapic isrvec_stk_top %d", + vlapic->isrvec_stk_top); + } + isrptr[idx] &= ~(1 << (bitpos - 1)); + VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); + vlapic->isrvec_stk_top--; + vlapic_update_ppr(vlapic); + return; + } + } +} + +static __inline int +vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +{ + return (*lvt & mask); +} + +static __inline int +vlapic_periodic_timer(struct vlapic *vlapic) +{ + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); +} + +static void +vlapic_fire_timer(struct vlapic *vlapic) +{ + int vector; + uint32_t *lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + + if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { + vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); + vlapic_set_intr_ready(vlapic, vector); + } +} + +static int +lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) +{ + int i; + cpumask_t dmask, thiscpumask; + uint32_t dest, vec, mode; + + thiscpumask = vcpu_mask(vlapic->vcpuid); + + dmask = 0; + dest = icrval >> 32; + vec = icrval & APIC_VECTOR_MASK; + mode = icrval & APIC_DELMODE_MASK; + + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { + switch (icrval & APIC_DEST_MASK) { + case APIC_DEST_DESTFLD: + dmask = vcpu_mask(dest); + break; + case APIC_DEST_SELF: + dmask = thiscpumask; + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm) & ~thiscpumask; + break; + } + + for (i = 0; i < VM_MAXCPU; i++) { + if (dmask & vcpu_mask(i)) { + if (mode == APIC_DELMODE_FIXED) + lapic_set_intr(vlapic->vm, i, vec); + else + vm_inject_nmi(vlapic->vm, i); + } + } + + return (0); /* handled completely in the kernel */ + } + + /* + * XXX this assumes that the startup IPI always succeeds + */ + if (mode == APIC_DELMODE_STARTUP) + vm_activate_cpu(vlapic->vm, dest); + + /* + * This will cause a return to userland. + */ + return (1); +} + +int +vlapic_pending_intr(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + int idx, i, bitpos, vector; + uint32_t *irrptr, val; + + irrptr = &lapic->irr0; + + /* + * The x86 architecture reserves the the first 32 vectors for use + * by the processor. + */ + for (i = 7; i > 0; i--) { + idx = i * 4; + val = atomic_load_acq_int(&irrptr[idx]); + bitpos = fls(val); + if (bitpos != 0) { + vector = i * 32 + (bitpos - 1); + if (PRIO(vector) > PRIO(lapic->ppr)) { + VLAPIC_CTR1(vlapic, "pending intr %d", vector); + return (vector); + } else + break; + } + } + VLAPIC_CTR0(vlapic, "no pending intr"); + return (-1); +} + +void +vlapic_intr_accepted(struct vlapic *vlapic, int vector) +{ + struct LAPIC *lapic = &vlapic->apic; + uint32_t *irrptr, *isrptr; + int idx, stk_top; + + /* + * clear the ready bit for vector being accepted in irr + * and set the vector as in service in isr. 
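+ *
+ * The index arithmetic below reflects the layout of struct LAPIC, where
+ * consecutive IRR/ISR words are 16 bytes (four uint32_t slots) apart,
+ * hence the "* 4".  Worked example: vector 0x40 (64) selects word
+ * (64 / 32) * 4 = 8, i.e. irr2/isr2, and bit 64 % 32 = 0 within it.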
+ */ + idx = (vector / 32) * 4; + + irrptr = &lapic->irr0; + atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); + VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); + + isrptr = &lapic->isr0; + isrptr[idx] |= 1 << (vector % 32); + VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + + /* + * Update the PPR + */ + vlapic->isrvec_stk_top++; + + stk_top = vlapic->isrvec_stk_top; + if (stk_top >= ISRVEC_STK_SIZE) + panic("isrvec_stk_top overflow %d", stk_top); + + vlapic->isrvec_stk[stk_top] = vector; + vlapic_update_ppr(vlapic); +} + +int +vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int i; + + if (offset > sizeof(*lapic)) { + *data = 0; + return 0; + } + + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + *data = lapic->id; + break; + case APIC_OFFSET_VER: + *data = lapic->version; + break; + case APIC_OFFSET_TPR: + *data = lapic->tpr; + break; + case APIC_OFFSET_APR: + *data = lapic->apr; + break; + case APIC_OFFSET_PPR: + *data = lapic->ppr; + break; + case APIC_OFFSET_EOI: + *data = lapic->eoi; + break; + case APIC_OFFSET_LDR: + *data = lapic->ldr; + break; + case APIC_OFFSET_DFR: + *data = lapic->dfr; + break; + case APIC_OFFSET_SVR: + *data = lapic->svr; + break; + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + i = (offset - APIC_OFFSET_ISR0) >> 2; + reg = &lapic->isr0; + *data = *(reg + i); + break; + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + i = (offset - APIC_OFFSET_TMR0) >> 2; + reg = &lapic->tmr0; + *data = *(reg + i); + break; + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + i = (offset - APIC_OFFSET_IRR0) >> 2; + reg = &lapic->irr0; + *data = atomic_load_acq_int(reg + i); + break; + case APIC_OFFSET_ESR: + *data = lapic->esr; + break; + case APIC_OFFSET_ICR_LOW: + *data = lapic->icr_lo; + break; + case APIC_OFFSET_ICR_HI: + *data = lapic->icr_hi; + break; + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + *data = *(reg); + break; + case APIC_OFFSET_ICR: + *data = lapic->icr_timer; + break; + case APIC_OFFSET_CCR: + *data = vlapic_get_ccr(vlapic); + break; + case APIC_OFFSET_DCR: + *data = lapic->dcr_timer; + break; + case APIC_OFFSET_RRR: + default: + *data = 0; + break; + } + return 0; +} + +int +vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data) +{ + struct vlapic *vlapic = (struct vlapic*)dev; + struct LAPIC *lapic = &vlapic->apic; + uint64_t offset = gpa & ~(PAGE_SIZE); + uint32_t *reg; + int retval; + + if (offset > sizeof(*lapic)) { + return 0; + } + + retval = 0; + offset &= ~3; + switch(offset) + { + case APIC_OFFSET_ID: + lapic->id = data; + break; + case APIC_OFFSET_TPR: + lapic->tpr = data & 0xff; + vlapic_update_ppr(vlapic); + break; + case APIC_OFFSET_EOI: + vlapic_process_eoi(vlapic); + break; + case APIC_OFFSET_LDR: + break; + case APIC_OFFSET_DFR: + break; + case APIC_OFFSET_SVR: + lapic->svr = data; + break; + case APIC_OFFSET_ICR_LOW: + retval = lapic_process_icr(vlapic, data); + break; + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + reg = vlapic_get_lvt(vlapic, offset); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + data |= APIC_LVT_M; + } + *reg = data; + // vlapic_dump_lvt(offset, reg); + break; + case APIC_OFFSET_ICR: + lapic->icr_timer = data; + vlapic_start_timer(vlapic, 0); + break; + + case APIC_OFFSET_DCR: + lapic->dcr_timer = data; + vlapic->divisor = vlapic_timer_divisor(data); + break; + + case APIC_OFFSET_ESR: + vlapic_update_errors(vlapic); + break; + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CCR: + default: + // Read only. + break; + } + + return (retval); +} + +void +vlapic_timer_tick(struct vlapic *vlapic) +{ + int curticks, delta, periodic; + uint32_t ccr; + uint32_t decrement, remainder; + + curticks = ticks; + + /* Common case */ + delta = curticks - vlapic->ccr_ticks; + if (delta == 0) + return; + + /* Local APIC timer is disabled */ + if (vlapic->apic.icr_timer == 0) + return; + + /* One-shot mode and timer has already counted down to zero */ + periodic = vlapic_periodic_timer(vlapic); + if (!periodic && vlapic->apic.ccr_timer == 0) + return; + /* + * The 'curticks' and 'ccr_ticks' are out of sync by more than + * 2^31 ticks. We deal with this by restarting the timer. + */ + if (delta < 0) { + vlapic_start_timer(vlapic, 0); + return; + } + + ccr = vlapic->apic.ccr_timer; + decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; + while (delta-- > 0) { + if (ccr <= decrement) { + remainder = decrement - ccr; + vlapic_fire_timer(vlapic); + if (periodic) { + vlapic_start_timer(vlapic, remainder); + ccr = vlapic->apic.ccr_timer; + } else { + /* + * One-shot timer has counted down to zero. + */ + ccr = 0; + break; + } + } else + ccr -= decrement; + } + + vlapic->ccr_ticks = curticks; + vlapic->apic.ccr_timer = ccr; +} + +struct vdev_ops vlapic_dev_ops = { + .name = "vlapic", + .init = vlapic_op_init, + .reset = vlapic_op_reset, + .halt = vlapic_op_halt, + .memread = vlapic_op_mem_read, + .memwrite = vlapic_op_mem_write, +}; +static struct io_region vlapic_mmio[VM_MAXCPU]; + +struct vlapic * +vlapic_init(struct vm *vm, int vcpuid) +{ + struct vlapic *vlapic; + + vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vm; + vlapic->vcpuid = vcpuid; + vlapic->ops = &vlapic_dev_ops; + + vlapic->mmio = vlapic_mmio + vcpuid; + vlapic->mmio->base = DEFAULT_APIC_BASE; + vlapic->mmio->len = PAGE_SIZE; + vlapic->mmio->attr = MMIO_READ|MMIO_WRITE; + vlapic->mmio->vcpu = vcpuid; + + vdev_register(&vlapic_dev_ops, vlapic); + + vlapic_op_init(vlapic); + + return (vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + vdev_unregister(vlapic); + free(vlapic, M_VLAPIC); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h new file mode 100644 index 0000000..861ea8c --- /dev/null +++ b/sys/amd64/vmm/io/vlapic.h @@ -0,0 +1,105 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +#include "vdev.h" + +struct vm; + +/* + * Map of APIC Registers: Offset Description Access + */ +#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W +#define APIC_OFFSET_VER 0x30 // Local APIC Version R +#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W +#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R +#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R +#define APIC_OFFSET_EOI 0xB0 // EOI Register W +#define APIC_OFFSET_RRR 0xC0 // Remote read R +#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W +#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W +#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W +#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R +#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R +#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R +#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R +#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R +#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R +#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R +#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R +#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R +#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R +#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R +#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R +#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R +#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R +#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R +#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R +#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R +#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R +#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R +#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R +#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R +#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R +#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R +#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R +#define APIC_OFFSET_ESR 0x280 // Error Status Register R +#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W +#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. 
(32-63) R/W +#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W +#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+) +#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+) +#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W +#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W +#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W +#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W +#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R +#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +struct vlapic *vlapic_init(struct vm *vm, int vcpuid); +void vlapic_cleanup(struct vlapic *vlapic); + +int vlapic_op_mem_write(void* dev, uint64_t gpa, + opsize_t size, uint64_t data); + +int vlapic_op_mem_read(void* dev, uint64_t gpa, + opsize_t size, uint64_t *data); + +int vlapic_pending_intr(struct vlapic *vlapic); +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); +void vlapic_set_intr_ready(struct vlapic *vlapic, int vector); +void vlapic_timer_tick(struct vlapic *vlapic); + +#endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c new file mode 100644 index 0000000..c93c31e --- /dev/null +++ b/sys/amd64/vmm/vmm.c @@ -0,0 +1,737 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/sched.h> +#include <sys/smp.h> +#include <sys/systm.h> + +#include <vm/vm.h> + +#include <machine/vm.h> +#include <machine/pcb.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_mem.h" +#include "vmm_util.h" +#include <machine/vmm_dev.h> +#include "vlapic.h" +#include "vmm_msr.h" +#include "vmm_ipi.h" +#include "vmm_stat.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +struct vcpu { + int flags; + int pincpu; /* host cpuid this vcpu is bound to */ + int hostcpu; /* host cpuid this vcpu last ran on */ + uint64_t guest_msrs[VMM_MSR_NUM]; + struct vlapic *vlapic; + int vcpuid; + struct savefpu savefpu; /* guest fpu state */ + void *stats; +}; +#define VCPU_F_PINNED 0x0001 +#define VCPU_F_RUNNING 0x0002 + +#define VCPU_PINCPU(vm, vcpuid) \ + ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1) + +#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED) + +#define VCPU_PIN(vm, vcpuid, host_cpuid) \ +do { \ + vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \ + vm->vcpu[vcpuid].pincpu = host_cpuid; \ +} while(0) + +#define VM_MAX_MEMORY_SEGMENTS 2 + +struct vm { + void *cookie; /* processor-specific data */ + void *iommu; /* iommu-specific data */ + struct vcpu vcpu[VM_MAXCPU]; + int num_mem_segs; + struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; + char name[VM_MAX_NAMELEN]; + + /* + * Mask of active vcpus. + * An active vcpu is one that has been started implicitly (BSP) or + * explicitly (AP) by sending it a startup ipi. + */ + cpumask_t active_cpus; +}; + +static struct vmm_ops *ops; +#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) + +#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) +#define VMRUN(vmi, vcpu, rip, vmexit) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, vmexit) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMMMAP(vmi, gpa, hpa, len, attr, prot, spm) \ + (ops != NULL ? (*ops->vmmmap)(vmi, gpa, hpa, len, attr, prot, spm) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \ + (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO) +#define VMNMI(vmi, vcpu) \ + (ops != NULL ? (*ops->vmnmi)(vmi, vcpu) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? 
(*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) + +#define fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define fpu_start_emulating() __asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \ + : : "n" (CR0_TS) : "ax") +#define fpu_stop_emulating() __asm("clts") + +static MALLOC_DEFINE(M_VM, "vm", "vm"); +CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ + +/* statistics */ +static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +static void +vcpu_cleanup(struct vcpu *vcpu) +{ + vlapic_cleanup(vcpu->vlapic); + vmm_stat_free(vcpu->stats); +} + +static void +vcpu_init(struct vm *vm, uint32_t vcpu_id) +{ + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpu_id]; + + vcpu->hostcpu = -1; + vcpu->vcpuid = vcpu_id; + vcpu->vlapic = vlapic_init(vm, vcpu_id); + fpugetregs(curthread, &vcpu->savefpu); + vcpu->stats = vmm_stat_alloc(); +} + +static int +vmm_init(void) +{ + int error; + + vmm_ipi_init(); + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + vmm_msr_init(); + + return (VMM_INIT()); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + iommu_init(); + error = vmm_init(); + break; + case MOD_UNLOAD: + vmmdev_cleanup(); + iommu_cleanup(); + vmm_ipi_cleanup(); + error = VMM_CLEANUP(); + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * Execute the module load handler after the pci passthru driver has had + * a chance to claim devices. We need this information at the time we do + * iommu initialization. 
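+ *
+ * (For illustration, not normative: when the module is compiled in or
+ * preloaded by the loader, its MOD_LOAD handler runs in the
+ * SI_SUB_CONFIGURE + 1 pass below, i.e. only after the pass in which
+ * PCI drivers such as the ppt passthru driver attach.)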
+ */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +struct vm * +vm_create(const char *name) +{ + int i; + struct vm *vm; + vm_paddr_t maxaddr; + + const int BSP = 0; + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (NULL); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->cookie = VMINIT(vm); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu_init(vm, i); + guest_msrs_init(vm, i); + } + + maxaddr = vmm_mem_maxaddr(); + vm->iommu = iommu_create_domain(maxaddr); + vm_activate_cpu(vm, BSP); + + return (vm); +} + +void +vm_destroy(struct vm *vm) +{ + int i; + + ppt_unassign_all(vm); + + for (i = 0; i < vm->num_mem_segs; i++) + vmm_mem_free(vm->mem_segs[i].hpa, vm->mem_segs[i].len); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(&vm->vcpu[i]); + + iommu_destroy_domain(vm->iommu); + + VMCLEANUP(vm->cookie); + + free(vm, M_VM); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_RW, spok)); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + return (VMMMAP(vm->cookie, gpa, 0, len, VM_MEMATTR_UNCACHEABLE, + VM_PROT_NONE, spok)); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t *ret_hpa) +{ + int error; + vm_paddr_t hpa; + + const boolean_t spok = TRUE; /* superpage mappings are ok */ + + /* + * find the hpa if already it was already vm_malloc'd. 
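+ *
+ * Illustrative usage (hypothetical caller, not part of this change):
+ * wiring 128MB of guest memory at gpa 0 is idempotent, so a second call
+ * with the same gpa simply hands back the hpa of the existing segment.
+ *
+ *	vm_paddr_t hpa;
+ *	error = vm_malloc(vm, 0, 128 * 1024 * 1024, &hpa);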
+ */ + hpa = vm_gpa2hpa(vm, gpa, len); + if (hpa != ((vm_paddr_t)-1)) + goto out; + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + hpa = vmm_mem_alloc(len); + if (hpa == 0) + return (ENOMEM); + + error = VMMMAP(vm->cookie, gpa, hpa, len, VM_MEMATTR_WRITE_BACK, + VM_PROT_ALL, spok); + if (error) { + vmm_mem_free(hpa, len); + return (error); + } + + iommu_create_mapping(vm->iommu, gpa, hpa, len); + + vm->mem_segs[vm->num_mem_segs].gpa = gpa; + vm->mem_segs[vm->num_mem_segs].hpa = hpa; + vm->mem_segs[vm->num_mem_segs].len = len; + vm->num_mem_segs++; +out: + *ret_hpa = hpa; + return (0); +} + +vm_paddr_t +vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int i; + vm_paddr_t gpabase, gpalimit, hpabase; + + for (i = 0; i < vm->num_mem_segs; i++) { + hpabase = vm->mem_segs[i].hpa; + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa + len <= gpalimit) + return ((gpa - gpabase) + hpabase); + } + return ((vm_paddr_t)-1); +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + *seg = vm->mem_segs[i]; + return (0); + } + } + return (-1); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMSETREG(vm->cookie, vcpu, reg, val)); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *cpuid = VCPU_PINCPU(vm, vcpuid); + + return (0); +} + +int +vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid) +{ + struct thread *td; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + td = curthread; /* XXXSMP only safe when muxing vcpus */ + + /* unpin */ + if (host_cpuid < 0) { + VCPU_UNPIN(vm, vcpuid); + thread_lock(td); + sched_unbind(td); + thread_unlock(td); + return (0); + } + + if (CPU_ABSENT(host_cpuid)) + return (EINVAL); + + /* + * XXX we should check that 'host_cpuid' has not already been pinned + * by another vm. 
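+ *
+ * A minimal sketch of such a check (hypothetical; it assumes a global
+ * cpumask_t of host cpus already pinned by any vm were maintained):
+ *
+ *	if (pinned_host_cpus & ((cpumask_t)1 << host_cpuid))
+ *		return (EBUSY);
+ *	pinned_host_cpus |= (cpumask_t)1 << host_cpuid;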
+ */ + thread_lock(td); + sched_bind(td, host_cpuid); + thread_unlock(td); + VCPU_PIN(vm, vcpuid, host_cpuid); + + return (0); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxrstor(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + register_t s; + + s = intr_disable(); + fpu_stop_emulating(); + fxsave(&vcpu->savefpu); + fpu_start_emulating(); + intr_restore(s); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + critical_enter(); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + pcb->pcb_full_iret = 1; + + vcpu->hostcpu = curcpu; + + fpuexit(curthread); + restore_guest_msrs(vm, vcpuid); + restore_guest_fpustate(vcpu); + error = VMRUN(vm->cookie, vcpuid, vmrun->rip, &vmrun->vm_exit); + save_guest_fpustate(vcpu); + restore_host_msrs(vm, vcpuid); + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + return (error); +} + +int +vm_inject_event(struct vm *vm, int vcpuid, int type, + int vector, uint32_t code, int code_valid) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0) + return (EINVAL); + + if (vector < 0 || vector > 255) + return (EINVAL); + + return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid)); +} + +int +vm_inject_nmi(struct vm *vm, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + error = VMNMI(vm->cookie, vcpu); + vm_interrupt_hostcpu(vm, vcpu); + return (error); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +uint64_t * +vm_guest_msrs(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].guest_msrs); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, b, s, f, n; + char *val, *cp, *cp2; + + /* + * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12" + */ + found = 0; + cp = val = getenv("pptdevs"); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +void +vm_set_run_state(struct vm *vm, int vcpuid, int state) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (state == VCPU_RUNNING) { + if (vcpu->flags & VCPU_F_RUNNING) { + panic("vm_set_run_state: %s[%d] is already running", + vm_name(vm), vcpuid); + } + vcpu->flags |= VCPU_F_RUNNING; + } else { + if ((vcpu->flags & 
VCPU_F_RUNNING) == 0) { + panic("vm_set_run_state: %s[%d] is already stopped", + vm_name(vm), vcpuid); + } + vcpu->flags &= ~VCPU_F_RUNNING; + } +} + +int +vm_get_run_state(struct vm *vm, int vcpuid, int *cpuptr) +{ + int retval, hostcpu; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + if (vcpu->flags & VCPU_F_RUNNING) { + retval = VCPU_RUNNING; + hostcpu = vcpu->hostcpu; + } else { + retval = VCPU_STOPPED; + hostcpu = -1; + } + + if (cpuptr) + *cpuptr = hostcpu; + + return (retval); +} + +void +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid >= 0 && vcpuid < VM_MAXCPU) + vm->active_cpus |= vcpu_mask(vcpuid); +} + +cpumask_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c new file mode 100644 index 0000000..cf443fc --- /dev/null +++ b/sys/amd64/vmm/vmm_dev.c @@ -0,0 +1,468 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/conf.h> +#include <sys/sysctl.h> +#include <sys/libkern.h> +#include <sys/ioccom.h> +#include <sys/mman.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/pmap.h> +#include <machine/vmparam.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "io/ppt.h" +#include <machine/vmm_dev.h> + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; +}; +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (sc->cdev == cdev) + break; + } + + return (sc); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c; + vm_paddr_t hpa, gpa; + struct vmmdev_softc *sc; + + static char zerobuf[PAGE_SIZE]; + + error = 0; + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa2hpa(sc->vm, gpa, c); + if (hpa == (vm_paddr_t)-1) { + if (uio->uio_rw == UIO_READ) + error = uiomove(zerobuf, c, uio); + else + error = EFAULT; + } else + error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio); + } + + mtx_unlock(&vmmdev_mtx); + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu; + struct vmmdev_softc *sc; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_seg_desc* vmsegdesc; + struct vm_pin *vmpin; + struct vm_run *vmrun; + struct vm_event *vmevent; + struct vm_lapic_irq *vmirq; + struct vm_capability *vmcap; + struct vm_pptdev *pptdev; + struct vm_pptdev_mmio *pptmmio; + struct vm_pptdev_msi *pptmsi; + struct vm_nmi *vmnmi; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup2(cdev); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (ENXIO); + } + + /* + * Some VMM ioctls can operate only on vcpus that are not running. 
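+ *
+ * These ioctls rely on the convention that the target vcpu is the first
+ * field of the argument structure, e.g. (sketch of the assumed layout;
+ * the real definitions live in vmm_dev.h):
+ *
+ *	struct vm_register {
+ *		int		cpuid;
+ *		int		regnum;
+ *		uint64_t	regval;
+ *	};
+ *
+ * which is what lets the code below recover the vcpu with *(int *)data.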
+ */ + switch (cmd) { + case VM_RUN: + case VM_SET_PINNING: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EVENT: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + if (vcpu_is_running(sc->vm, vcpu, NULL)) { + error = EBUSY; + goto done; + } + break; + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_RUNNING); + mtx_unlock(&vmmdev_mtx); + + error = vm_run(sc->vm, vmrun); + + mtx_lock(&vmmdev_mtx); + vm_set_run_state(sc->vm, vmrun->cpuid, VCPU_STOPPED); + break; + case VM_STAT_DESC: { + const char *desc; + statdesc = (struct vm_stat_desc *)data; + desc = vmm_stat_desc(statdesc->index); + if (desc != NULL) { + error = 0; + strlcpy(statdesc->desc, desc, sizeof(statdesc->desc)); + } else + error = EINVAL; + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_PPTDEV_MSI: + pptmsi = (struct vm_pptdev_msi *)data; + error = ppt_setup_msi(sc->vm, pptmsi->vcpu, + pptmsi->bus, pptmsi->slot, pptmsi->func, + pptmsi->destcpu, pptmsi->vector, + pptmsi->numvec); + break; + case VM_MAP_PPTDEV_MMIO: + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, + pptmmio->hpa); + break; + case VM_BIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_UNBIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_INJECT_EVENT: + vmevent = (struct vm_event *)data; + error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, + vmevent->vector, + vmevent->error_code, + vmevent->error_code_valid); + break; + case VM_INJECT_NMI: + vmnmi = (struct vm_nmi *)data; + error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + break; + case VM_LAPIC_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector); + break; + case VM_SET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_set_pinning(sc->vm, vmpin->vm_cpuid, + vmpin->host_cpuid); + break; + case VM_GET_PINNING: + vmpin = (struct vm_pin *)data; + error = vm_get_pinning(sc->vm, vmpin->vm_cpuid, + &vmpin->host_cpuid); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len, &seg->hpa); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->hpa = seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + 
vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + break; + default: + error = ENOTTY; + break; + } +done: + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static int +vmmdev_mmap(struct cdev *cdev, vm_offset_t offset, vm_paddr_t *paddr, int nprot) +{ + int error; + struct vmmdev_softc *sc; + + error = -1; + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && (nprot & PROT_EXEC) == 0) { + *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE); + if (*paddr != (vm_paddr_t)-1) + error = 0; + } + + mtx_unlock(&vmmdev_mtx); + + return (error); +} + +static void +vmmdev_destroy(struct vmmdev_softc *sc) +{ + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + /* + * XXX must stop virtual machine instances that may be still + * running and cleanup their state. + */ + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + destroy_dev(sc->cdev); + vm_destroy(sc->vm); + free(sc, M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + vmmdev_destroy(sc); + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap = vmmdev_mmap, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct vmmdev_softc *sc; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + + sc = vmmdev_lookup(buf); + if (sc != NULL) { + mtx_unlock(&vmmdev_mtx); + return (EEXIST); + } + + vm = vm_create(buf); + if (vm == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "vmm/%s", buf); + sc->cdev->si_drv1 = sc; + SLIST_INSERT_HEAD(&head, sc, link); + + mtx_unlock(&vmmdev_mtx); + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +void +vmmdev_cleanup(void) +{ + struct vmmdev_softc *sc, *sc2; + + mtx_lock(&vmmdev_mtx); + + SLIST_FOREACH_SAFE(sc, &head, link, sc2) + vmmdev_destroy(sc); + + mtx_unlock(&vmmdev_mtx); +} diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c new file mode 100644 index 
0000000..c8e795b --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/bus.h> + +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> +#include <machine/segments.h> +#include <machine/md_var.h> +#include <machine/smp.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" + +extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); + +/* + * The default is to use the IPI_AST to interrupt a vcpu. + */ +static int ipinum = IPI_AST; + +CTASSERT(APIC_SPURIOUS_INT == 255); + +void +vmm_ipi_init(void) +{ + int idx; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as our IPI vector. We install the 'justreturn' handler at that + * vector and use it to interrupt the vcpus. + * + * We do this because the IPI_AST is heavyweight and saves all + * registers in the trapframe. This is overkill for our use case + * which is simply to EOI the interrupt and return. + */ + idx = APIC_SPURIOUS_INT; + while (--idx >= APIC_IPI_INTS) { + ip = &idt[idx]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { + ipinum = idx; + setidt(ipinum, IDTVEC(justreturn), SDT_SYSIGT, + SEL_KPL, 0); + break; + } + } + + if (ipinum != IPI_AST && bootverbose) { + printf("vmm_ipi_init: installing ipi handler to interrupt " + "vcpus at vector %d\n", ipinum); + } +} + +void +vmm_ipi_cleanup(void) +{ + if (ipinum != IPI_AST) + setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +void +vm_interrupt_hostcpu(struct vm *vm, int vcpu) +{ + int hostcpu; + + if (vcpu_is_running(vm, vcpu, &hostcpu) && hostcpu != curcpu) + ipi_selected((cpumask_t)1 << hostcpu, ipinum); +} diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h new file mode 100644 index 0000000..7ab94bf --- /dev/null +++ b/sys/amd64/vmm/vmm_ipi.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IPI_H_ +#define _VMM_IPI_H_ + +struct vm; + +void vmm_ipi_init(void); +void vmm_ipi_cleanup(void); +void vm_interrupt_hostcpu(struct vm *vm, int vcpu); + +#endif diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h new file mode 100644 index 0000000..e691c61 --- /dev/null +++ b/sys/amd64/vmm/vmm_ktr.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include <sys/ktr.h> +#include <sys/pcpu.h> + +#define KTR_VMM KTR_GEN + +#define VMM_CTR0(vm, vcpuid, format) \ +CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu) + +#define VMM_CTR1(vm, vcpuid, format, p1) \ +CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1)) + +#define VMM_CTR2(vm, vcpuid, format, p1, p2) \ +CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2)) + +#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \ + (p1), (p2), (p3)) +#endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c new file mode 100644 index 0000000..8704fcf --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.c @@ -0,0 +1,121 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/vmm.h> +#include "vmm_ipi.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +int +lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_write(vlapic, offset, DWORD, val) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *rv) +{ + int handled; + + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (vlapic_op_mem_read(vlapic, offset, DWORD, rv) == 0) + handled = 1; + else + handled = 0; + + return (handled); +} + +int +lapic_pending_intr(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + return (vlapic_pending_intr(vlapic)); +} + +void +lapic_intr_accepted(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_intr_accepted(vlapic, vector); +} + +int +lapic_set_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (vector < 32 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + vlapic_set_intr_ready(vlapic, vector); + + vm_interrupt_hostcpu(vm, cpu); + + return (0); +} + +void +lapic_timer_tick(struct vm *vm, int cpu) +{ + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + vlapic_timer_tick(vlapic); +} diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h new file mode 100644 index 0000000..815b2f7 --- /dev/null +++ b/sys/amd64/vmm/vmm_lapic.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +int lapic_write(struct vm *vm, int cpu, u_int offset, uint64_t val); +int lapic_read(struct vm *vm, int cpu, u_int offset, uint64_t *retval); +void lapic_timer_tick(struct vm *vm, int cpu); + +/* + * Returns a vector between 32 and 255 if an interrupt is pending in the + * IRR that can be delivered based on the current state of ISR and TPR. 
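+ *
+ * A typical consumer loop in a hypervisor backend might look roughly
+ * like this (illustrative pseudo-helpers; the interruptibility check
+ * and the injection mechanism are backend specific):
+ *
+ *	vector = lapic_pending_intr(vm, cpu);
+ *	if (vector >= 32 && guest_interruptible()) {
+ *		inject_interrupt(vector);
+ *		lapic_intr_accepted(vm, cpu, vector);
+ *	}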
+ * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + * + * Returns -1 if there is no eligible vector that can be delivered to the + * guest at this time. + */ +int lapic_pending_intr(struct vm *vm, int cpu); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'lapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void lapic_intr_accepted(struct vm *vm, int cpu, int vector); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector); + +#endif diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c new file mode 100644 index 0000000..9ce1e80 --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.c @@ -0,0 +1,413 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/linker.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/md_var.h> +#include <machine/metadata.h> +#include <machine/pc/bios.h> +#include <machine/vmparam.h> +#include <machine/pmap.h> + +#include "vmm_util.h" +#include "vmm_mem.h" + +static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory"); + +#define MB (1024 * 1024) +#define GB (1024 * MB) + +#define VMM_MEM_MAXSEGS 64 + +/* protected by vmm_mem_mtx */ +static struct { + vm_paddr_t base; + vm_size_t length; +} vmm_mem_avail[VMM_MEM_MAXSEGS]; + +static int vmm_mem_nsegs; + +static vm_paddr_t maxaddr; + +static struct mtx vmm_mem_mtx; + +/* + * Steal any memory that was deliberately hidden from FreeBSD either by + * the use of MAXMEM kernel config option or the hw.physmem loader tunable. 
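+ *
+ * For example (illustrative figures), booting an 8GB machine with
+ *
+ *	hw.physmem="4G"
+ *
+ * in /boot/loader.conf leaves roughly 4GB above the limit for vmm to
+ * claim here; the exact amount depends on the BIOS memory map.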
+ */ +static int +vmm_mem_steal_memory(void) +{ + int nsegs; + caddr_t kmdp; + uint32_t smapsize; + uint64_t base, length; + struct bios_smap *smapbase, *smap, *smapend; + + /* + * Borrowed from hammer_time() and getmemsize() in machdep.c + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp == NULL) + kmdp = preload_search_by_type("elf64 kernel"); + + smapbase = (struct bios_smap *)preload_search_info(kmdp, + MODINFO_METADATA | MODINFOMD_SMAP); + if (smapbase == NULL) + panic("No BIOS smap info from loader!"); + + smapsize = *((uint32_t *)smapbase - 1); + smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); + + nsegs = 0; + for (smap = smapbase; smap < smapend; smap++) { + /* + * XXX + * Assuming non-overlapping, monotonically increasing + * memory segments. + */ + if (smap->type != SMAP_TYPE_MEMORY) + continue; + if (smap->length == 0) + break; + + base = roundup(smap->base, NBPDR); + length = rounddown(smap->length, NBPDR); + + /* Skip this segment if FreeBSD is using all of it. */ + if (base + length <= ptoa(Maxmem)) + continue; + + /* + * If FreeBSD is using part of this segment then adjust + * 'base' and 'length' accordingly. + */ + if (base < ptoa(Maxmem)) { + uint64_t used; + used = roundup(ptoa(Maxmem), NBPDR) - base; + base += used; + length -= used; + } + + if (length == 0) + continue; + + vmm_mem_avail[nsegs].base = base; + vmm_mem_avail[nsegs].length = length; + + if (base + length > maxaddr) + maxaddr = base + length; + + if (0 && bootverbose) { + printf("vmm_mem_populate: index %d, base 0x%0lx, " + "length %ld\n", + nsegs, vmm_mem_avail[nsegs].base, + vmm_mem_avail[nsegs].length); + } + + nsegs++; + if (nsegs >= VMM_MEM_MAXSEGS) { + printf("vmm_mem_populate: maximum number of vmm memory " + "segments reached!\n"); + return (ENOSPC); + } + } + + vmm_mem_nsegs = nsegs; + + return (0); +} + +static void +vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end) +{ + vm_paddr_t addr, remaining; + int pdpi, pdi, superpage_size; + pml4_entry_t *pml4p; + pdp_entry_t *pdp; + pd_entry_t *pd; + uint64_t page_attr_bits; + + if (end >= NBPML4) + panic("Cannot map memory beyond %ldGB", NBPML4 / GB); + + /* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */ + if (0 && vmm_supports_1G_pages()) + superpage_size = NBPDP; + else + superpage_size = NBPDR; + + /* + * Get the page directory pointer page that contains the direct + * map address mappings. + */ + pml4p = kernel_pmap->pm_pml4; + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK); + + page_attr_bits = PG_RW | PG_V | PG_PS | PG_G; + addr = start; + while (addr < end) { + remaining = end - addr; + pdpi = addr / NBPDP; + if (superpage_size == NBPDP && + remaining >= NBPDP && + addr % NBPDP == 0) { + /* + * If there isn't a mapping for this address then + * create one but if there is one already make sure + * it matches what we expect it to be. 
+ */ + if (pdp[pdpi] == 0) { + pdp[pdpi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 1GB page at " + "pdpi %d\n", addr, pdpi); + } + } else { + pdp_entry_t pdpe = pdp[pdpi]; + if ((pdpe & ~PAGE_MASK) != addr || + (pdpe & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pdpe, addr); + } + } + addr += NBPDP; + } else { + if (remaining < NBPDR) { + panic("vmm_mem_populate: remaining (%ld) must " + "be greater than NBPDR (%d)\n", + remaining, NBPDR); + } + if (pdp[pdpi] == 0) { + /* + * XXX we lose this memory forever because + * we do not keep track of the virtual address + * that would be required to free this page. + */ + pd = malloc(PAGE_SIZE, M_VMM_MEM, + M_WAITOK | M_ZERO); + if ((uintptr_t)pd & PAGE_MASK) { + panic("vmm_mem_populate: page directory" + "page not aligned on %d " + "boundary\n", PAGE_SIZE); + } + pdp[pdpi] = vtophys(pd); + pdp[pdpi] |= PG_RW | PG_V | PG_U; + if (0 && bootverbose) { + printf("Creating page directory " + "at pdp index %d for 0x%016lx\n", + pdpi, addr); + } + } + pdi = (addr % NBPDP) / NBPDR; + pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK); + + /* + * Create a new mapping if one doesn't already exist + * or validate it if it does. + */ + if (pd[pdi] == 0) { + pd[pdi] = addr | page_attr_bits; + if (0 && bootverbose) { + printf("vmm_mem_populate: mapping " + "0x%lx with 2MB page at " + "pdpi %d, pdi %d\n", + addr, pdpi, pdi); + } + } else { + pd_entry_t pde = pd[pdi]; + if ((pde & ~PAGE_MASK) != addr || + (pde & page_attr_bits) != page_attr_bits) { + panic("An invalid mapping 0x%016lx " + "already exists for 0x%016lx\n", + pde, addr); + } + } + addr += NBPDR; + } + } +} + +static int +vmm_mem_populate(void) +{ + int seg, error; + vm_paddr_t start, end; + + /* populate the vmm_mem_avail[] array */ + error = vmm_mem_steal_memory(); + if (error) + return (error); + + /* + * Now map the memory that was hidden from FreeBSD in + * the direct map VA space. 
+ */ + for (seg = 0; seg < vmm_mem_nsegs; seg++) { + start = vmm_mem_avail[seg].base; + end = start + vmm_mem_avail[seg].length; + if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) { + panic("start (0x%016lx) and end (0x%016lx) must be " + "aligned on a %dMB boundary\n", + start, end, NBPDR / MB); + } + vmm_mem_direct_map(start, end); + } + + return (0); +} + +int +vmm_mem_init(void) +{ + int error; + + mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF); + + error = vmm_mem_populate(); + if (error) + return (error); + + return (0); +} + +vm_paddr_t +vmm_mem_alloc(size_t size) +{ + int i; + vm_paddr_t addr; + + if ((size & PDRMASK) != 0) { + panic("vmm_mem_alloc: size 0x%0lx must be " + "aligned on a 0x%0x boundary\n", size, NBPDR); + } + + addr = 0; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].length >= size) { + addr = vmm_mem_avail[i].base; + vmm_mem_avail[i].base += size; + vmm_mem_avail[i].length -= size; + /* remove a zero length segment */ + if (vmm_mem_avail[i].length == 0) { + memmove(&vmm_mem_avail[i], + &vmm_mem_avail[i + 1], + (vmm_mem_nsegs - (i + 1)) * + sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + } + break; + } + } + mtx_unlock(&vmm_mem_mtx); + + return (addr); +} + +void +vmm_mem_free(vm_paddr_t base, size_t length) +{ + int i; + + if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) { + panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be " + "aligned on a 0x%0x boundary\n", base, length, NBPDR); + } + + mtx_lock(&vmm_mem_mtx); + + for (i = 0; i < vmm_mem_nsegs; i++) { + if (vmm_mem_avail[i].base > base) + break; + } + + if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS) + panic("vmm_mem_free: cannot free any more segments"); + + /* Create a new segment at index 'i' */ + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i], + (vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0])); + + vmm_mem_avail[i].base = base; + vmm_mem_avail[i].length = length; + + vmm_mem_nsegs++; + +coalesce_some_more: + for (i = 0; i < vmm_mem_nsegs - 1; i++) { + if (vmm_mem_avail[i].base + vmm_mem_avail[i].length == + vmm_mem_avail[i + 1].base) { + vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length; + memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2], + (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0])); + vmm_mem_nsegs--; + goto coalesce_some_more; + } + } + + mtx_unlock(&vmm_mem_mtx); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (maxaddr); +} + +void +vmm_mem_dump(void) +{ + int i; + vm_paddr_t base; + vm_size_t length; + + mtx_lock(&vmm_mem_mtx); + for (i = 0; i < vmm_mem_nsegs; i++) { + base = vmm_mem_avail[i].base; + length = vmm_mem_avail[i].length; + printf("%-4d0x%016lx 0x%016lx\n", i, base, base + length); + } + mtx_unlock(&vmm_mem_mtx); +} diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h new file mode 100644 index 0000000..ef1bf1a --- /dev/null +++ b/sys/amd64/vmm/vmm_mem.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +int vmm_mem_init(void); +vm_paddr_t vmm_mem_alloc(size_t size); +void vmm_mem_free(vm_paddr_t start, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); +void vmm_mem_dump(void); + +#endif diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c new file mode 100644 index 0000000..152aa7b --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.c @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/specialreg.h> +#include <machine/apicreg.h> + +#include <machine/vmm.h> +#include "vmm_lapic.h" +#include "vmm_msr.h" + +#define VMM_MSR_F_EMULATE 0x01 +#define VMM_MSR_F_READONLY 0x02 + +struct vmm_msr { + int num; + int flags; + uint64_t hostval; +}; + +static struct vmm_msr vmm_msr[] = { + { MSR_LSTAR, 0 }, + { MSR_CSTAR, 0 }, + { MSR_STAR, 0 }, + { MSR_SF_MASK, 0 }, + { MSR_APICBASE, VMM_MSR_F_EMULATE }, + { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE }, + { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY }, +}; + +#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0])) +CTASSERT(VMM_MSR_NUM >= vmm_msr_num); + +#define readonly_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0) + +#define emulated_msr(idx) \ + ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0) + +void +vmm_msr_init(void) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + /* + * XXX this assumes that the value of the host msr does not + * change after we have cached it. + */ + vmm_msr[i].hostval = rdmsr(vmm_msr[i].num); + } +} + +void +guest_msrs_init(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + switch (vmm_msr[i].num) { + case MSR_LSTAR: + case MSR_CSTAR: + case MSR_STAR: + case MSR_SF_MASK: + case MSR_BIOS_SIGN: + case MSR_MCG_CAP: + guest_msrs[i] = 0; + break; + case MSR_APICBASE: + guest_msrs[i] = DEFAULT_APIC_BASE | APICBASE_ENABLED | + APICBASE_X2APIC; + if (cpu == 0) + guest_msrs[i] |= APICBASE_BSP; + break; + default: + panic("guest_msrs_init: missing initialization for msr " + "0x%0x", vmm_msr[i].num); + } + } +} + +static boolean_t +x2apic_msr(u_int num) +{ + + if (num >= 0x800 && num <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +static boolean_t +x2apic_msr_id(u_int num) +{ + return (num == 0x802); +} + +static int +msr_num_to_idx(u_int num) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) + if (vmm_msr[i].num == num) + return (i); + + return (-1); +} + +int +emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) +{ + int handled, idx; + uint64_t *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) + return (lapic_write(vm, cpu, x2apic_msr_to_regoff(num), val)); + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + if (!readonly_msr(idx)) { + guest_msrs = vm_guest_msrs(vm, cpu); + + /* Stash the value */ + guest_msrs[idx] = val; + + /* Update processor state for non-emulated MSRs */ + if (!emulated_msr(idx)) + wrmsr(vmm_msr[idx].num, val); + } + + handled = 1; +done: + return (handled); +} + +int +emulate_rdmsr(struct vm *vm, int cpu, u_int num) +{ + int error, handled, idx; + uint32_t eax, edx; + uint64_t result, *guest_msrs; + + handled = 0; + + if (x2apic_msr(num)) { + handled = lapic_read(vm, cpu, x2apic_msr_to_regoff(num), + &result); + /* + * The version ID needs to be massaged + */ + if (x2apic_msr_id(num)) { + result = result >> 24; + } + goto done; + } + + idx = msr_num_to_idx(num); + if (idx < 0) + goto done; + + guest_msrs = vm_guest_msrs(vm, cpu); + result = guest_msrs[idx]; + + /* + * If this is not an emulated msr register make sure that the processor + * state matches our cached state. 
+ */ + if (!emulated_msr(idx) && (rdmsr(num) != result)) { + panic("emulate_rdmsr: msr 0x%0x has inconsistent cached " + "(0x%016lx) and actual (0x%016lx) values", num, + result, rdmsr(num)); + } + + handled = 1; + +done: + if (handled) { + eax = result; + edx = result >> 32; + error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax); + if (error) + panic("vm_set_register(rax) error %d", error); + error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx); + if (error) + panic("vm_set_register(rdx) error %d", error); + } + return (handled); +} + +void +restore_guest_msrs(struct vm *vm, int cpu) +{ + int i; + uint64_t *guest_msrs; + + guest_msrs = vm_guest_msrs(vm, cpu); + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, guest_msrs[i]); + } +} + +void +restore_host_msrs(struct vm *vm, int cpu) +{ + int i; + + for (i = 0; i < vmm_msr_num; i++) { + if (emulated_msr(i)) + continue; + else + wrmsr(vmm_msr[i].num, vmm_msr[i].hostval); + } +} diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h new file mode 100644 index 0000000..1e15787 --- /dev/null +++ b/sys/amd64/vmm/vmm_msr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MSR_H_ +#define _VMM_MSR_H_ + +#define VMM_MSR_NUM 16 +struct vm; + +void vmm_msr_init(void); +int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); +int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); +void guest_msrs_init(struct vm *vm, int cpu); +void restore_host_msrs(struct vm *vm, int cpu); +void restore_guest_msrs(struct vm *vm, int cpu); + +#endif diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c new file mode 100644 index 0000000..e6f5c48 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/vmm.h> +#include "vmm_stat.h" + +static int vstnum; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +void +vmm_stat_init(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vstnum >= MAX_VMM_STAT_TYPES) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vstnum; + vsttab[vstnum++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + int i; + uint64_t *stats; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vstnum; i++) + buf[i] = stats[i]; + *num_stats = vstnum; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + u_long size; + + size = vstnum * sizeof(uint64_t); + + return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK)); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +const char * +vmm_stat_desc(int index) +{ + + if (index >= 0 && index < vstnum) + return (vsttab[index]->desc); + else + return (NULL); +} diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h new file mode 100644 index 0000000..7c075a6 --- /dev/null +++ b/sys/amd64/vmm/vmm_stat.h @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_TYPES 64 /* arbitrary */ + +struct vmm_stat_type { + const char *desc; /* description of statistic */ + int index; /* position in the stats buffer */ +}; + +void vmm_stat_init(void *arg); + +#define VMM_STAT_DEFINE(type, desc) \ + struct vmm_stat_type type[1] = { \ + { desc, -1 } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + +void *vmm_stat_alloc(void); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +const char *vmm_stat_desc(int index); + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats = vcpu_stats(vm, vcpu); + if (vst->index >= 0) + stats[vst->index] += x; +#endif +} + +#endif diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S new file mode 100644 index 0000000..2afc608 --- /dev/null +++ b/sys/amd64/vmm/vmm_support.S @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define LOCORE + +#include <machine/asmacros.h> + +#define LA_EOI 0xB0 + + .text + SUPERALIGN_TEXT +IDTVEC(justreturn) + pushq %rax + movq lapic, %rax + movl $0, LA_EOI(%rax) + popq %rax + iretq diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c new file mode 100644 index 0000000..f245f92 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/libkern.h> + +#include <machine/md_var.h> + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. + */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#include <sys/proc.h> +#include <machine/frame.h> +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h new file mode 100644 index 0000000..7f82332 --- /dev/null +++ b/sys/amd64/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c new file mode 100644 index 0000000..45c4c53 --- /dev/null +++ b/sys/amd64/vmm/x86.c @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> + +#include "x86.h" + +int +x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + unsigned int func, regs[4]; + + func = *eax; + + cpuid_count(*eax, *ecx, regs); + + switch(func) { + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_0000_0004: + case CPUID_0000_000A: + break; + + case CPUID_8000_0000: + case CPUID_8000_0001: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + case CPUID_8000_0007: + case CPUID_8000_0008: + + break; + + case CPUID_0000_0001: + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_0000_0001_APICID_MASK); + /* + * XXX fixme for MP case, set apicid properly for cpu. + */ + regs[1] |= (0 << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX capability. + * Advertise x2APIC capability. 
+ */ + regs[2] &= ~CPUID_0000_0001_FEAT0_VMX; + regs[2] |= CPUID2_X2APIC; + + /* + * Machine check handling is done in the host. + * Hide MTRR capability. + */ + regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + break; + + case CPUID_0000_000B: + /* + * XXXSMP fixme + * Processor topology enumeration + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = *ecx & 0xff; + regs[3] = 0; + break; + + default: + return (0); + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + return (1); +} + diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h new file mode 100644 index 0000000..bc4f8a4 --- /dev/null +++ b/sys/amd64/vmm/x86.h @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, + uint32_t *edx); + +#endif |
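
A minimal sketch (not part of the diff above) of the interrupt delivery sequence described by the comments in vmm_lapic.h: lapic_set_intr() latches a vector in the IRR and notifies the host cpu, lapic_pending_intr() then reports a deliverable vector (or -1), and lapic_intr_accepted() moves that vector from the IRR to the ISR once it has actually been injected. The helpers vcpu_interrupts_enabled() and backend_inject_intr() are placeholders for backend-specific (VT-x/SVM) logic and do not exist in this change.

#include "vmm_lapic.h"

/* Placeholder backend hooks, assumed here for illustration only. */
int	vcpu_interrupts_enabled(struct vm *vm, int vcpu);	/* e.g. RFLAGS.IF and blocking state */
void	backend_inject_intr(struct vm *vm, int vcpu, int vector);

static void
deliver_pending_intr(struct vm *vm, int vcpu)
{
	int vector;

	/* Ask the vLAPIC for a vector that the IRR/ISR/TPR state allows right now. */
	vector = lapic_pending_intr(vm, vcpu);
	if (vector < 0)
		return;

	/* Only inject if the guest can currently accept interrupts. */
	if (!vcpu_interrupts_enabled(vm, vcpu))
		return;

	backend_inject_intr(vm, vcpu, vector);

	/* Transition the vector from IRR to ISR now that it has been injected. */
	lapic_intr_accepted(vm, vcpu, vector);
}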
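
On the MSR side, the x2APIC MSR range 0x800-0xBFF handled in vmm_msr.c is translated to the equivalent xAPIC register offset as (msr - 0x800) << 4. For example, the TPR MSR 0x808 becomes offset 0x80 and the APIC ID MSR 0x802 becomes offset 0x20. The vLAPIC returns the ID register in xAPIC layout, with the ID in bits 31:24, which is presumably why emulate_rdmsr() shifts the value read for MSR 0x802 right by 24 before handing it to the guest as a flat x2APIC ID.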
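
Finally, a minimal sketch of how a statistic declared with the VMM_STAT_DEFINE() macro from vmm_stat.h would be maintained. The counter name and the calling function below are hypothetical stand-ins for real consumers such as the VT-x or SVM exit handlers.

#include "vmm_stat.h"

/* Hypothetical counter; VMM_STAT_DEFINE() registers it via SYSINIT at load time. */
VMM_STAT_DEFINE(VMEXIT_SAMPLE, "sample vm exits");

static void
sample_exit_handler(struct vm *vm, int vcpu)
{
	/* Bump the per-vcpu counter; compiled out unless VMM_KEEP_STATS is defined. */
	vmm_stat_incr(vm, vcpu, VMEXIT_SAMPLE, 1);
}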