path: root/sys/amd64
author     neel <neel@FreeBSD.org>  2013-01-19 04:18:52 +0000
committer  neel <neel@FreeBSD.org>  2013-01-19 04:18:52 +0000
commit     363335d53e3c955602378aa434d0054c48a6e0d6 (patch)
tree       5af6fe77acd4da3002c907484fd64133deb95c8d /sys/amd64
parent     3600c83b820e00959d61600e67e9dcb32ef6b518 (diff)
parent     dde8bf641fc7c8e9541167cd7c01523973d0b569 (diff)
Merge projects/bhyve to head.
'bhyve' was developed by grehan@ and myself at NetApp (thanks!).

Special thanks to Peter Snyder, Joe Caradonna and Michael Dexter for
their support and encouragement.

Obtained from: NetApp
Diffstat (limited to 'sys/amd64')
-rw-r--r--  sys/amd64/include/vmm.h                       293
-rw-r--r--  sys/amd64/include/vmm_dev.h                   215
-rw-r--r--  sys/amd64/include/vmm_instruction_emul.h      113
-rw-r--r--  sys/amd64/vmm/amd/amdv.c                      265
-rw-r--r--  sys/amd64/vmm/intel/ept.c                     392
-rw-r--r--  sys/amd64/vmm/intel/ept.h                      43
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c                    551
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h                    338
-rw-r--r--  sys/amd64/vmm/intel/vmx.c                    1845
-rw-r--r--  sys/amd64/vmm/intel/vmx.h                     120
-rw-r--r--  sys/amd64/vmm/intel/vmx_controls.h             92
-rw-r--r--  sys/amd64/vmm/intel/vmx_cpufunc.h             218
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c             89
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c                 172
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h                  78
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S             246
-rw-r--r--  sys/amd64/vmm/intel/vtd.c                     677
-rw-r--r--  sys/amd64/vmm/io/iommu.c                      277
-rw-r--r--  sys/amd64/vmm/io/iommu.h                       75
-rw-r--r--  sys/amd64/vmm/io/ppt.c                        610
-rw-r--r--  sys/amd64/vmm/io/ppt.h                         41
-rw-r--r--  sys/amd64/vmm/io/vdev.c                       270
-rw-r--r--  sys/amd64/vmm/io/vdev.h                        84
-rw-r--r--  sys/amd64/vmm/io/vlapic.c                     901
-rw-r--r--  sys/amd64/vmm/io/vlapic.h                     111
-rw-r--r--  sys/amd64/vmm/vmm.c                          1022
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                       538
-rw-r--r--  sys/amd64/vmm/vmm_host.c                      124
-rw-r--r--  sys/amd64/vmm/vmm_host.h                       75
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c          810
-rw-r--r--  sys/amd64/vmm/vmm_ipi.c                        93
-rw-r--r--  sys/amd64/vmm/vmm_ipi.h                        39
-rw-r--r--  sys/amd64/vmm/vmm_ktr.h                        51
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c                     201
-rw-r--r--  sys/amd64/vmm/vmm_lapic.h                      71
-rw-r--r--  sys/amd64/vmm/vmm_mem.c                       135
-rw-r--r--  sys/amd64/vmm/vmm_mem.h                        37
-rw-r--r--  sys/amd64/vmm/vmm_msr.c                       254
-rw-r--r--  sys/amd64/vmm/vmm_msr.h                        43
-rw-r--r--  sys/amd64/vmm/vmm_stat.c                      104
-rw-r--r--  sys/amd64/vmm/vmm_stat.h                       71
-rw-r--r--  sys/amd64/vmm/vmm_support.S                    42
-rw-r--r--  sys/amd64/vmm/vmm_util.c                      111
-rw-r--r--  sys/amd64/vmm/vmm_util.h                       40
-rw-r--r--  sys/amd64/vmm/x86.c                           202
-rw-r--r--  sys/amd64/vmm/x86.h                            64
46 files changed, 12243 insertions, 0 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
new file mode 100644
index 0000000..024c30e
--- /dev/null
+++ b/sys/amd64/include/vmm.h
@@ -0,0 +1,293 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#ifdef _KERNEL
+
+#define VM_MAX_NAMELEN 32
+
+struct vm;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vlapic;
+
+enum x2apic_state;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
+ vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot,
+ boolean_t superpages_ok);
+typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_inject_event_t)(void *vmi, int vcpu,
+ int type, int vector,
+ uint32_t code, int code_valid);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_set_func_t vmmmap_set;
+ vmi_mmap_get_func_t vmmmap_get;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_inject_event_t vminject;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+struct vm *vm_create(const char *name);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_get_pinning(struct vm *vm, int vcpu, int *cpuid);
+int vm_set_pinning(struct vm *vm, int vcpu, int cpuid);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_event(struct vm *vm, int vcpu, int type,
+ int vector, uint32_t error_code, int error_code_valid);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+void vm_activate_cpu(struct vm *vm, int vcpu);
+cpuset_t vm_active_cpus(struct vm *vm);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_RUNNING,
+ VCPU_CANNOT_RUN,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu);
+
+static int __inline
+vcpu_is_running(struct vm *vm, int vcpu)
+{
+ return (vcpu_get_state(vm, vcpu) == VCPU_RUNNING);
+}
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+
+#endif /* _KERNEL */
+
+#include <machine/vmm_instruction_emul.h>
+
+#define VM_MAXCPU 8 /* maximum virtual cpus */
+
+/*
+ * Identifiers for events that can be injected into the VM
+ */
+enum vm_event_type {
+ VM_EVENT_NONE,
+ VM_HW_INTR,
+ VM_NMI,
+ VM_HW_EXCEPTION,
+ VM_SW_INTR,
+ VM_PRIV_SW_EXCEPTION,
+ VM_SW_EXCEPTION,
+ VM_EVENT_MAX
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_LAST
+};
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_MAX
+};
+
+enum x2apic_state {
+ X2APIC_ENABLED,
+ X2APIC_AVAILABLE,
+ X2APIC_DISABLED,
+ X2APIC_STATE_LAST
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_MAX
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1; /* out is 0, in is 1 */
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+ } inout;
+ struct {
+ uint64_t gpa;
+ struct vie vie;
+ } paging;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int error; /* vmx inst error */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ } u;
+};
+
+#endif /* _VMM_H_ */
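
The vmm_ops table above is the boundary between the hardware-independent vmm
code and a CPU-vendor backend (vmm_ops_intel or vmm_ops_amd). As a rough
sketch of how a backend plugs in (the null_* names are invented for
illustration and are not part of this commit; they simply mirror the
positional-initializer style that amdv.c uses further down):

static int
null_init(void)
{
	/* module-wide setup; return 0 on success */
	return (0);
}

static void *
null_vminit(struct vm *vm)
{
	/* allocate and return backend-private per-VM state */
	return (NULL);
}

static int
null_vmrun(void *vmi, int vcpu, register_t rip)
{
	/* enter the guest at 'rip' and report the reason for exiting */
	return (0);
}

struct vmm_ops vmm_ops_null = {
	null_init,		/* init */
	NULL,			/* cleanup */
	null_vminit,		/* vminit */
	null_vmrun,		/* vmrun */
	/* remaining members are zero-filled in this sketch */
};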
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
new file mode 100644
index 0000000..79f893d
--- /dev/null
+++ b/sys/amd64/include/vmm_dev.h
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: vmm_dev.h 482 2011-05-09 21:22:43Z grehan $
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+int vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t gpa; /* in */
+ size_t len; /* in */
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_pin {
+ int vm_cpuid;
+ int host_cpuid; /* -1 to unpin */
+};
+
+struct vm_run {
+ int cpuid;
+ uint64_t rip; /* start running here */
+ struct vm_exit vm_exit;
+};
+
+struct vm_event {
+ int cpuid;
+ enum vm_event_type type;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ int vector;
+ int destcpu;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint32_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+enum {
+ IOCNUM_RUN,
+ IOCNUM_SET_PINNING,
+ IOCNUM_GET_PINNING,
+ IOCNUM_MAP_MEMORY,
+ IOCNUM_GET_MEMORY_SEG,
+ IOCNUM_SET_REGISTER,
+ IOCNUM_GET_REGISTER,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR,
+ IOCNUM_INJECT_EVENT,
+ IOCNUM_LAPIC_IRQ,
+ IOCNUM_SET_CAPABILITY,
+ IOCNUM_GET_CAPABILITY,
+ IOCNUM_BIND_PPTDEV,
+ IOCNUM_UNBIND_PPTDEV,
+ IOCNUM_MAP_PPTDEV_MMIO,
+ IOCNUM_PPTDEV_MSI,
+ IOCNUM_PPTDEV_MSIX,
+ IOCNUM_INJECT_NMI,
+ IOCNUM_VM_STATS,
+ IOCNUM_VM_STAT_DESC,
+ IOCNUM_SET_X2APIC_STATE,
+ IOCNUM_GET_X2APIC_STATE,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_SET_PINNING \
+ _IOW('v', IOCNUM_SET_PINNING, struct vm_pin)
+#define VM_GET_PINNING \
+ _IOWR('v', IOCNUM_GET_PINNING, struct vm_pin)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EVENT \
+ _IOW('v', IOCNUM_INJECT_EVENT, struct vm_event)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#endif
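
To make the ioctl protocol concrete, here is a hedged userland sketch of
driving a single vcpu with VM_RUN. The device path /dev/vmm/<name> comes from
vmm_dev.c below; the VM name "testvm", the run_once helper, and the bare-bones
error handling are assumptions of this sketch, and real consumers typically go
through bhyve's libvmmapi rather than raw ioctls:

#include <sys/types.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <fcntl.h>
#include <stdio.h>

/*
 * Sketch only: run vcpu 0 until its first VM-exit.  Assumes a VM named
 * "testvm" already exists and has memory mapped for the code at 'rip'.
 */
int
run_once(uint64_t rip)
{
	struct vm_run vmrun;
	int fd;

	fd = open("/dev/vmm/testvm", O_RDWR);
	if (fd < 0)
		return (-1);

	vmrun.cpuid = 0;
	vmrun.rip = rip;		/* start running here */
	if (ioctl(fd, VM_RUN, &vmrun) != 0)
		return (-1);

	printf("vcpu 0 exited with code %d at rip 0x%lx\n",
	    (int)vmrun.vm_exit.exitcode, vmrun.vm_exit.rip);
	return (0);
}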
diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h
new file mode 100644
index 0000000..4cc494b
--- /dev/null
+++ b/sys/amd64/include/vmm_instruction_emul.h
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1;
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t mrr, mem_region_write_t mrw,
+ void *mrarg);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ uint64_t rip, int inst_length, uint64_t cr3,
+ struct vie *vie);
+
+int vmm_decode_instruction(struct vm *vm, int cpuid,
+ uint64_t gla, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
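
For a concrete picture of the callback contract, a hypothetical device model
might back a 4KB MMIO page with an array of 32-bit registers as below. The
fake_* names are invented; only the mem_region_read_t/mem_region_write_t
signatures come from the header above, and the sketch assumes 4-byte accesses:

/* Illustrative register file backing one 4KB page of fake MMIO. */
static uint32_t fake_regs[1024];

static int
fake_mmio_read(void *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize, void *arg)
{
	/* assumes rsize == 4; a real model would switch on rsize */
	*rval = fake_regs[(gpa & 0xfff) >> 2];
	return (0);
}

static int
fake_mmio_write(void *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize, void *arg)
{
	fake_regs[(gpa & 0xfff) >> 2] = (uint32_t)wval;
	return (0);
}

/*
 * Once vmm_fetch_instruction() and vmm_decode_instruction() have
 * populated 'vie', the faulting access is replayed through the
 * callbacks:
 *
 *	error = vmm_emulate_instruction(vm, cpuid, gpa, vie,
 *	    fake_mmio_read, fake_mmio_write, NULL);
 */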
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
new file mode 100644
index 0000000..dc071d3
--- /dev/null
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -0,0 +1,265 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "io/iommu.h"
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+ return;
+}
+
+static int
+amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap_set: not implemented\n");
+ return (EINVAL);
+}
+
+static vm_paddr_t
+amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+
+ printf("amdv_vmmmap_get: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_inject_event(void *vmi, int vcpu, int type, int vector,
+ uint32_t error_code, int error_code_valid)
+{
+
+ printf("amdv_inject_event: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap_set,
+ amdv_vmmmap_get,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_inject_event,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static uint64_t
+amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ printf("amd_iommu_remove_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+static void
+amd_iommu_invalidate_tlb(void *domain)
+{
+
+ printf("amd_iommu_invalidate_tlb: not implemented\n");
+}
+
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_remove_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+ amd_iommu_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
new file mode 100644
index 0000000..4f91601
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx_msr.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0)
+ return;
+
+ tabs = 3 - nlevels;
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) {
+ ptpval = ptp[i];
+
+ if (ptpval == 0)
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+ * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - super page sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+ * XXX should we enforce this memory type by setting the
+ * ignore PAT bit to 1.
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+ pgmask = (1UL << ptpshift) - 1;
+ hpabase = ptpval & ~pgmask;
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ }
+
+ return ((vm_paddr_t)-1);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg;
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
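
The shift arithmetic in ept_create_mapping() and ept_lookup_mapping() is
easier to follow with concrete numbers: each of the four levels consumes 9
index bits above the 12-bit page offset. A standalone sketch of the same walk,
runnable in userland (PAGE_SHIFT and EPT_PWLEVELS are copied here only so the
example is self-contained):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define EPT_PWLEVELS	4	/* page walk levels, as in ept.h */

int
main(void)
{
	uint64_t gpa = 0x12345678000UL;	/* arbitrary guest-physical addr */
	int nlevels, ptpshift, ptpindex;

	/* Same loop structure as ept_create_mapping(). */
	for (nlevels = EPT_PWLEVELS - 1; nlevels >= 0; nlevels--) {
		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gpa >> ptpshift) & 0x1FF;
		printf("level %d: shift %2d, index %3d\n",
		    nlevels, ptpshift, ptpindex);
	}
	return (0);
}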
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
new file mode 100644
index 0000000..2d7258d
--- /dev/null
+++ b/sys/amd64/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
+
+int ept_init(void);
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
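
For a concrete value of the EPTP() macro: bits 2:0 hold the memory type of
the paging structures and bits 5:3 hold the walk length minus one, so with a
4-level walk and write-back structures (PAT_WRITE_BACK is 0x06 in
machine/specialreg.h) EPTP(pml4) evaluates to pml4 | (3 << 3) | 6, i.e.
pml4 | 0x1e.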
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
new file mode 100644
index 0000000..a5784dd
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -0,0 +1,551 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmcs.h"
+#include "vmx_cpufunc.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+ * sign bit. As it so happens the upper 16 bits are reserved (i.e.
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
+ */
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ VMPTRLD(vmcs);
+ error = vmread(encoding, retval);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ VMPTRLD(vmcs);
+ error = vmwrite(encoding, val);
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat();
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE;
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ if (error != 0)
+ panic("vmcs_read(%u) error %d", encoding, error);
+
+ return (val);
+}
+
+#ifdef DDB
+extern int vmxon_enabled[];
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit;
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) {
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs);
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) {
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000)
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) {
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) {
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff);
+ if (val & 0x800)
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif
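
The getreg/setreg pair above brackets every access with VMPTRLD/VMCLEAR, so
callers can treat the VMCS as passive state. A hedged sketch of the usual
pattern, advancing the guest %rip past an instruction that was emulated
(skip_emulated_instruction is an invented name; 'inst_length' would come from
the vm_exit):

static int
skip_emulated_instruction(struct vmcs *vmcs, int inst_length)
{
	uint64_t rip;
	int error;

	/* vmcs_getreg() maps VM_REG_GUEST_RIP to VMCS_GUEST_RIP */
	error = vmcs_getreg(vmcs, VM_REG_GUEST_RIP, &rip);
	if (error == 0)
		error = vmcs_setreg(vmcs, VM_REG_GUEST_RIP,
		    rip + inst_length);
	return (error);
}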
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
new file mode 100644
index 0000000..f39eed2
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -0,0 +1,338 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int ident, uint64_t *retval);
+int vmcs_setreg(struct vmcs *vmcs, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int ident,
+ struct seg_desc *desc);
+uint64_t vmcs_read(uint32_t encoding);
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+
+#endif /* _KERNEL */
+
+#define VMCS_INITIAL 0xffffffffffffffff
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_EPTP 0x0000201A
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
+#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define VMCS_HOST_RIP 0x00006C16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC 44
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
+#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
+#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
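+/*
+ * Illustrative usage, as in vmx_inject_nmi(): an event is injected by
+ * programming the VM-entry interruption information field, e.g.
+ *
+ * info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID | IDT_NMI;
+ * vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ */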
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
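+/*
+ * Illustrative usage, as in vmx_ept_fault(): the fault direction is
+ * recovered from the exit qualification, e.g.
+ *
+ * read = (ept_qual & EPT_VIOLATION_DATA_READ) ? 1 : 0;
+ * write = (ept_qual & EPT_VIOLATION_DATA_WRITE) ? 1 : 0;
+ *
+ * The guest-linear address is meaningful only if EPT_VIOLATION_GLA_VALID
+ * is also set in the qualification.
+ */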
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
new file mode 100644
index 0000000..4f267bb
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -0,0 +1,1845 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/pmap.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vmx_msr.h"
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER)
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER
+
+#define VM_ENTRY_CTLS_ONE_SETTING \
+ (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \
+ VM_ENTRY_LOAD_PAT)
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
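+/*
+ * Give the guest direct read/write access to 'msr' via the VM's MSR bitmap.
+ */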
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+
+static volatile u_int nextvpid;
+
+static int vmx_no_patmsr;
+
+/*
+ * Virtual NMI blocking conditions.
+ *
+ * Some processor implementations also require NMIs to be blocked if
+ * the STI_BLOCKING bit is set. It is possible to detect this at runtime
+ * based on the (exit_reason,exit_qual) tuple being set to
+ * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
+ *
+ * We take the easy way out and also include STI_BLOCKING as one of the
+ * gating items for vNMI injection.
+ */
+static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
+ VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_STI_BLOCKING;
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+static VMM_STAT_DEFINE(VMEXIT_EXTINT, "vm exits due to external interrupt");
+static VMM_STAT_DEFINE(VMEXIT_HLT_IGNORED, "number of times hlt was ignored");
+static VMM_STAT_DEFINE(VMEXIT_HLT, "number of times hlt was intercepted");
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC:
+ return "apic";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ case VMX_RETURN_AST:
+ return "ast";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static __inline void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
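+/*
+ * Force the bits in 'cr0' that VMX operation requires to be fixed to 0
+ * or 1, as derived from the MSR_VMX_CR0_FIXED[01] MSRs in vmx_init().
+ * vmx_fix_cr4() does the same for 'cr4'.
+ */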
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
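+/*
+ * Populate the guest MSR save area that is saved on VM exit and restored
+ * on VM entry (currently just MSR_KGSBASE).
+ */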
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+ * VMXON and VMXOFF are not required to invalidate any TLB
+ * caching structures, so invalidate all EPT- and VPID-tagged
+ * mappings explicitly here to avoid retaining stale cached
+ * information between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+ error = vmxon(vmxon_region[curcpu]);
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & 0x5) != 0x5) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ /* Try again without the PAT MSR bits */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
+ MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ } else {
+ if (bootverbose)
+ printf("vmm: PAT MSR access not supported\n");
+ guest_msr_valid(MSR_PAT);
+ vmx_no_patmsr = 1;
+ }
+ }
+
+ /* Check support for VM-entry controls */
+ if (!vmx_no_patmsr) {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ } else {
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
+ MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
+ VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ }
+
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ return (0);
+}
+
+/*
+ * If this processor does not support VPIDs then simply return 0.
+ *
+ * Otherwise generate the next value of VPID to use. Any value is alright
+ * as long as it is non-zero.
+ *
+ * We always execute in VMX non-root context with EPT enabled. Thus all
+ * combined mappings are tagged with the (EP4TA, VPID, PCID) tuple. This
+ * in turn means that multiple VMs can share the same VPID as long as
+ * they have distinct EPT page tables.
+ *
+ * XXX
+ * We should optimize this so that it returns VPIDs that are not in
+ * use. Then we will not unnecessarily invalidate mappings in
+ * vmx_set_pcpu_defaults() just because two or more vcpus happen to
+ * use the same 'vpid'.
+ */
+static uint16_t
+vmx_vpid(void)
+{
+ uint16_t vpid = 0;
+
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) != 0) {
+ do {
+ vpid = atomic_fetchadd_int(&nextvpid, 1);
+ } while (vpid == 0);
+ }
+
+ return (vpid);
+}
+
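+/*
+ * Program the CR0/CR4 guest/host mask and read shadow. Guest reads of
+ * the masked bits return the shadow value; guest attempts to change a
+ * masked bit cause a VM exit.
+ */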
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value, shadow_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ shadow_value = cr0_ones_mask;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ shadow_value = cr4_ones_mask;
+ }
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, VMCS_IDENT(shadow_ident), shadow_value);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs) vmx_setup_cr_shadow(0, (vmcs))
+#define vmx_setup_cr4_shadow(vmcs) vmx_setup_cr_shadow(4, (vmcs))
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid;
+ int i, error, guest_msr_count;
+ struct vmx *vmx;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * Guest KGSBASE is saved and restored in the guest MSR save area.
+ * Host KGSBASE is restored before returning to userland from the pcb.
+ * There will be a window of time when we are executing in the host
+ * kernel context with a value of KGSBASE from the guest. This is ok
+ * because the value of KGSBASE is inconsequential in kernel context.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_KGSBASE) ||
+ guest_msr_rw(vmx, MSR_EFER))
+ panic("vmx_vminit: error setting guest msr access");
+
+ /*
+ * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit. However, if running on a system with no
+ * MSR_PAT save/restore support, leave access disabled so accesses
+ * will be trapped.
+ */
+ if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ panic("vmx_vminit: error setting guest pat msr access");
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmx->vmcs[i].identifier = vmx_revision();
+ error = vmclear(&vmx->vmcs[i]);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vpid = vmx_vpid();
+
+ error = vmcs_set_defaults(&vmx->vmcs[i],
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid;
+
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ error = vmx_setup_cr0_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(&vmx->vmcs[i]);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid(vm, vcpu,
+ (uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx),
+ (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+static int
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ int error, lastcpu;
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+
+ vmxstate = &vmx->state[vcpu];
+ lastcpu = vmxstate->lastcpu;
+ vmxstate->lastcpu = curcpu;
+
+ if (lastcpu == curcpu) {
+ error = 0;
+ goto done;
+ }
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ if (error != 0)
+ goto done;
+
+ error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ if (error != 0)
+ goto done;
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+done:
+ return (error);
+}
+
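+/*
+ * Advance the guest %rip in the VMCS past the instruction that caused
+ * the VM exit.
+ */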
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
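+/*
+ * The helpers below toggle the interrupt-window and NMI-window exiting
+ * bits in the vcpu's primary processor-based controls so that the guest
+ * exits as soon as it can accept a pending event.
+ */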
+static __inline void
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_int_window_exiting: vmwrite error %d", error);
+}
+
+static __inline void
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
+}
+
+static __inline void
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static __inline void
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+ int error;
+
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+
+ error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ if (error)
+ panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
+}
+
+static int
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ int error;
+ uint64_t info, interruptibility;
+
+ /* Bail out if no NMI requested */
+ if (!vm_nmi_pending(vmx->vm, vcpu))
+ return (0);
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_nmi: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & nmi_blocking_bits)
+ goto nmiblocked;
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info |= IDT_NMI;
+
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
+
+ VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+ return (1);
+
+nmiblocked:
+ /*
+ * Set the NMI Window Exiting execution control so we can inject
+ * the virtual NMI as soon as the blocking condition goes away.
+ */
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ return (1);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+{
+ int error, vector;
+ uint64_t info, rflags, interruptibility;
+
+ const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
+
+ /*
+ * If there is already an interrupt pending then just return.
+ *
+ * This could happen if an interrupt was injected on a prior
+ * VM entry but the actual entry into guest mode was aborted
+ * because of a pending AST.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &info);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return;
+
+ /*
+ * NMI injection has priority so deal with those first
+ */
+ if (vmx_inject_nmi(vmx, vcpu))
+ return;
+
+ /* Ask the local apic for a vector to inject */
+ vector = lapic_pending_intr(vmx->vm, vcpu);
+ if (vector < 0)
+ return;
+
+ if (vector < 32 || vector > 255)
+ panic("vmx_inject_interrupts: invalid vector %d\n", vector);
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ error = vmread(VMCS_GUEST_RFLAGS, &rflags);
+ if (error)
+ panic("vmx_inject_interrupts: vmread(rflags) %d", error);
+
+ if ((rflags & PSL_I) == 0)
+ goto cantinject;
+
+ error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
+ if (error) {
+ panic("vmx_inject_interrupts: vmread(interruptibility) %d",
+ error);
+ }
+ if (interruptibility & HWINTR_BLOCKED)
+ goto cantinject;
+
+ /* Inject the interrupt */
+ info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info |= vector;
+ error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
+ if (error)
+ panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
+
+ /* Update the Local APIC ISR */
+ lapic_intr_accepted(vmx->vm, vcpu, vector);
+
+ VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as the blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+
+ VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+}
+
+static int
+vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ int error, cr, vmcs_guest_cr;
+ uint64_t regval, ones_mask, zeros_mask;
+ const struct vmxctx *vmxctx;
+
+ /* We only handle mov to %cr0 or %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ cr = exitqual & 0xf;
+ if (cr != 0 && cr != 4)
+ return (UNHANDLED);
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ /*
+ * We must use vmwrite() directly here because vmcs_setreg() will
+ * call vmclear(vmcs) as a side-effect, which we certainly don't want.
+ */
+ switch ((exitqual >> 8) & 0xf) {
+ case 0:
+ regval = vmxctx->guest_rax;
+ break;
+ case 1:
+ regval = vmxctx->guest_rcx;
+ break;
+ case 2:
+ regval = vmxctx->guest_rdx;
+ break;
+ case 3:
+ regval = vmxctx->guest_rbx;
+ break;
+ case 4:
+ error = vmread(VMCS_GUEST_RSP, &regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: "
+ "error %d reading guest rsp", error);
+ }
+ break;
+ case 5:
+ regval = vmxctx->guest_rbp;
+ break;
+ case 6:
+ regval = vmxctx->guest_rsi;
+ break;
+ case 7:
+ regval = vmxctx->guest_rdi;
+ break;
+ case 8:
+ regval = vmxctx->guest_r8;
+ break;
+ case 9:
+ regval = vmxctx->guest_r9;
+ break;
+ case 10:
+ regval = vmxctx->guest_r10;
+ break;
+ case 11:
+ regval = vmxctx->guest_r11;
+ break;
+ case 12:
+ regval = vmxctx->guest_r12;
+ break;
+ case 13:
+ regval = vmxctx->guest_r13;
+ break;
+ case 14:
+ regval = vmxctx->guest_r14;
+ break;
+ case 15:
+ regval = vmxctx->guest_r15;
+ break;
+ }
+
+ if (cr == 0) {
+ ones_mask = cr0_ones_mask;
+ zeros_mask = cr0_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR0;
+ } else {
+ ones_mask = cr4_ones_mask;
+ zeros_mask = cr4_zeros_mask;
+ vmcs_guest_cr = VMCS_GUEST_CR4;
+ }
+ regval |= ones_mask;
+ regval &= ~zeros_mask;
+ error = vmwrite(vmcs_guest_cr, regval);
+ if (error) {
+ panic("vmx_emulate_cr_access: error %d writing cr%d",
+ error, cr);
+ }
+
+ return (HANDLED);
+}
+
+static int
+vmx_ept_fault(struct vm *vm, int cpu,
+ uint64_t gla, uint64_t gpa, uint64_t rip, int inst_length,
+ uint64_t cr3, uint64_t ept_qual, struct vie *vie)
+{
+ int read, write, error;
+
+ /* EPT violation on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (UNHANDLED);
+
+ /* EPT violation must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (UNHANDLED);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (UNHANDLED);
+ }
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, cpu, rip, inst_length, cr3, vie) != 0)
+ return (UNHANDLED);
+
+ if (vmm_decode_instruction(vm, cpu, gla, vie) != 0)
+ return (UNHANDLED);
+
+ /*
+ * Check if this is a local apic access
+ */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE)
+ return (UNHANDLED);
+
+ error = vmm_emulate_instruction(vm, cpu, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ return (error ? UNHANDLED : HANDLED);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int error, handled;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ uint32_t eax, ecx, edx;
+ uint64_t qual, gla, gpa, cr3, intr_info;
+
+ handled = 0;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ break;
+ case EXIT_REASON_RDMSR:
+ ecx = vmxctx->guest_rcx;
+ error = emulate_rdmsr(vmx->vm, vcpu, ecx);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_WRMSR:
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ error = emulate_wrmsr(vmx->vm, vcpu, ecx,
+ (uint64_t)edx << 32 | eax);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ } else
+ handled = 1;
+ break;
+ case EXIT_REASON_HLT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ /*
+ * If there is an event waiting to be injected then there is
+ * no need to 'hlt'.
+ */
+ error = vmread(VMCS_ENTRY_INTR_INFO, &intr_info);
+ if (error)
+ panic("vmx_exit_process: vmread(intrinfo) %d", error);
+
+ if (intr_info & VMCS_INTERRUPTION_INFO_VALID) {
+ handled = 1;
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT_IGNORED, 1);
+ } else
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ break;
+ case EXIT_REASON_MTF:
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+ /* FALLTHRU */
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+
+ /*
+ * This is special. We want to treat this as a 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ break;
+ case EXIT_REASON_CPUID:
+ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ gla = vmcs_gla();
+ gpa = vmcs_gpa();
+ cr3 = vmcs_guest_cr3();
+ handled = vmx_ept_fault(vmx->vm, vcpu, gla, gpa,
+ vmexit->rip, vmexit->inst_length,
+ cr3, qual, &vmexit->u.paging.vie);
+ if (!handled) {
+ vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->u.paging.gpa = gpa;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel.
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+
+ /*
+ * Special case for spinning up an AP - exit to userspace to
+ * give the controlling process a chance to intercept and
+ * spin up a thread for the AP.
+ */
+ if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+ handled = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+
+ vmx = arg;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ /*
+ * XXX Can we avoid doing this every time we do a vm run?
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
+ panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
+
+ if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+
+ if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
+ panic("vmx_run: error %d setting up pcpu defaults", error);
+
+ do {
+ lapic_timer_tick(vmx->vm, vcpu);
+ vmx_inject_interrupts(vmx, vcpu);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /* enable interrupts */
+ enable_intr();
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS;
+ * if a VM exit is not handled then the exitcode must not be BOGUS.
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode);
+
+ /*
+ * XXX
+ * We need to do this to ensure that any VMCS state cached by the
+ * processor is flushed to memory. We need to do this in case the
+ * VM moves to a different cpu the next time it runs.
+ *
+ * Can we avoid doing this?
+ */
+ VMCLEAR(vmcs);
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.error = vie;
+ VMCLEAR(vmcs);
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int error;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXXSMP we also need to clear the VMCS active on the other vcpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+
+ return;
+}
+
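+/*
+ * Translate a VM_REG_GUEST_* identifier into a pointer to the register's
+ * save slot in the 'vmxctx', or NULL if the register is not kept there.
+ */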
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ struct vmx *vmx = arg;
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_getreg will VMCLEAR the vmcs when it is done, which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ /*
+ * XXX Allow caller to set contents of the guest registers saved in
+ * the 'vmxctx' even though the vcpu might be running. We need this
+ * specifically to support the rdmsr emulation that will set the
+ * %eax and %edx registers during vm exit processing.
+ */
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ /*
+ * If the vcpu is running then don't mess with the VMCS.
+ *
+ * vmcs_setreg will VMCLEAR the vmcs when it is done, which will cause
+ * the subsequent vmlaunch/vmresume to fail.
+ */
+ if (vcpu_is_running(vmx->vm, vcpu))
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu],
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ struct vmx *vmx = arg;
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+}
+
+static int
+vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
+ int code_valid)
+{
+ int error;
+ uint64_t info;
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+
+ static uint32_t type_map[VM_EVENT_MAX] = {
+ 0x1, /* VM_EVENT_NONE */
+ 0x0, /* VM_HW_INTR */
+ 0x2, /* VM_NMI */
+ 0x3, /* VM_HW_EXCEPTION */
+ 0x4, /* VM_SW_INTR */
+ 0x5, /* VM_PRIV_SW_EXCEPTION */
+ 0x6, /* VM_SW_EXCEPTION */
+ };
+
+ /*
+ * If there is already an exception pending to be delivered to the
+ * vcpu then just return.
+ */
+ error = vmcs_getreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
+ if (error)
+ return (error);
+
+ if (info & VMCS_INTERRUPTION_INFO_VALID)
+ return (EAGAIN);
+
+ info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
+ info |= VMCS_INTERRUPTION_INFO_VALID;
+ error = vmcs_setreg(vmcs, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
+ if (error != 0)
+ return (error);
+
+ if (code_valid) {
+ error = vmcs_setreg(vmcs,
+ VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
+ code);
+ }
+ return (error);
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_inject,
+ vmx_getcap,
+ vmx_setcap
+};
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
new file mode 100644
index 0000000..c7cd567
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+};
+
+struct vmxstate {
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+#endif
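
The VMX_RETURN_* codes above define a setjmp/longjmp-style protocol between
the C code and vmx_support.S. A simplified sketch of the consumer side,
assuming the retry-on-AST convention used by the real loop in vmx_run()
(the error numbers chosen here are illustrative):

/* Sketch: one guest-entry attempt driven by the vmx_setjmp() protocol. */
static int
example_run_once(struct vmxctx *vmxctx)
{
	int rc;

	rc = vmx_setjmp(vmxctx);
	switch (rc) {
	case VMX_RETURN_DIRECT:		/* first return: enter the guest */
		if (vmxctx->launched)
			vmx_resume(vmxctx);  /* comes back via vmx_setjmp */
		else
			vmx_launch(vmxctx);  /* comes back via vmx_setjmp */
		/* NOTREACHED */
	case VMX_RETURN_LONGJMP:
		return (0);			/* a VM exit brought us back */
	case VMX_RETURN_AST:
		return (EAGAIN);		/* service the AST and retry */
	case VMX_RETURN_VMRESUME:
	case VMX_RETURN_VMLAUNCH:
		return (vmxctx->launch_error);	/* vmresume/vmlaunch failed */
	}
	return (EINVAL);
}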
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000..31f29f8
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1 << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
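
These bit definitions pair with the VMCS field encodings in vmcs.h. A small
hedged sketch of how one might be consulted at runtime, assuming the caller
has the vcpu's VMCS loaded (VMPTRLD) and using vmread() from vmx_cpufunc.h:

/* Sketch: test whether HLT exiting is currently enabled in the VMCS. */
static int
example_hlt_exiting_enabled(void)
{
	uint64_t ctls;

	if (vmread(VMCS_PRI_PROC_BASED_CTLS, &ctls) != 0)
		return (0);		/* vmread failed; assume disabled */
	return ((ctls & PROCBASED_HLT_EXITING) != 0);
}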
diff --git a/sys/amd64/vmm/intel/vmx_cpufunc.h b/sys/amd64/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000..2e66443
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMfailInvalid	1
+ * VMfailValid		2	see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
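
The invvpid() wrapper above is typically used when stale guest-linear
translations tagged with a vcpu's VPID must be discarded, for example when a
vcpu migrates between host cpus. A minimal sketch, assuming the vpid comes
from 'struct vmxstate' in vmx.h:

/* Sketch: flush all cached mappings tagged with one VPID. */
static void
example_flush_vpid(uint16_t vpid)
{
	struct invvpid_desc desc;

	desc.vpid = vpid;
	desc._res1 = 0;
	desc._res2 = 0;
	desc.linear_addr = 0;	/* ignored for single-context invalidation */
	invvpid(INVVPID_TYPE_SINGLE_CONTEXT, desc);
}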
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
new file mode 100644
index 0000000..823a05d
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/assym.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "vmx.h"
+#include "vmx_cpufunc.h"
+
+ASSYM(VMXCTX_TMPSTKTOP, offsetof(struct vmxctx, tmpstktop));
+ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
+ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
+ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
+ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
+ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
+ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
+ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
+ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
+ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
+ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
+ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
+ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
+ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
+ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
+ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
+ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
+
+ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
+ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
+ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
+ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
+ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
+ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
+ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
+ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
+
+ASSYM(VMXCTX_LAUNCH_ERROR, offsetof(struct vmxctx, launch_error));
+
+ASSYM(VM_SUCCESS, VM_SUCCESS);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+
+ASSYM(VMX_RETURN_DIRECT, VMX_RETURN_DIRECT);
+ASSYM(VMX_RETURN_LONGJMP, VMX_RETURN_LONGJMP);
+ASSYM(VMX_RETURN_VMRESUME, VMX_RETURN_VMRESUME);
+ASSYM(VMX_RETURN_VMLAUNCH, VMX_RETURN_VMLAUNCH);
+ASSYM(VMX_RETURN_AST, VMX_RETURN_AST);
+
+ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
+ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
+ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread));
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000..2aba63c
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
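
A sketch of a typical request, as made from vmx_init(): bits in 'ones_mask'
must be settable to 1 and bits in 'zeros_mask' settable to 0 or the call
fails with EINVAL, while every other bit keeps its capability-MSR default.
The particular control bits chosen here are illustrative:

/* Sketch: build the primary processor-based controls. */
static int
example_procbased_ctls(uint32_t *ctls)
{

	return (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
	    MSR_VMX_TRUE_PROCBASED_CTLS,
	    PROCBASED_HLT_EXITING | PROCBASED_MSR_BITMAPS,	/* must be 1 */
	    PROCBASED_CR8_LOAD_EXITING,				/* must be 0 */
	    ctls));
}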
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
+ if (msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
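
The bitmap layout implied above: bytes 0-1023 are the read bitmap for MSRs
0x0-0x1FFF, bytes 1024-2047 the read bitmap for 0xC0000000-0xC0001FFF, and
the corresponding write bitmaps sit 2048 bytes later; a set bit forces a VM
exit, a clear bit grants the guest direct access. A worked sketch (the MSR
chosen is illustrative):

/* Sketch: let the guest access IA32_GS_BASE (0xC0000101) directly. */
static void
example_allow_gsbase(char *bitmap)
{

	msr_bitmap_initialize(bitmap);	/* start with every access exiting */
	/* Clears bit 1 of byte 1056 (reads) and of byte 3104 (writes). */
	(void)msr_bitmap_change_access(bitmap, 0xC0000101,
	    MSR_BITMAP_ACCESS_RW);
}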
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000..e6379a9
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_EPT_VPID_CAP 0x48C
+
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
+
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
+
+#define MSR_VMX_PROCBASED_CTLS2 0x48B
+
+#define MSR_VMX_EXIT_CTLS 0x483
+#define	MSR_VMX_TRUE_EXIT_CTLS	0x48F
+
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs, unless we do it the hard way - which would
+ * involve some form of synchronization between the vcpus to vmclear
+ * all VMCSes that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#endif
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
new file mode 100644
index 0000000..4ba582a
--- /dev/null
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack so we don't want an interrupt handler to trash it
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+	movq	VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is '0' when it returns directly from here.
+ * Return value is '1' when it returns after a vm exit through vmx_longjmp.
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 2.
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 3.
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
new file mode 100644
index 0000000..ef0e9bc
--- /dev/null
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -0,0 +1,677 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/pci_cfgreg.h>
+
+#include "io/iommu.h"
+
+/*
+ * Documented in the "Intel Virtualization Technology for Directed I/O",
+ * Architecture Spec, September 2008.
+ */
+
+/* Section 10.4 "Register Descriptions" */
+struct vtdmap {
+ volatile uint32_t version;
+ volatile uint32_t res0;
+ volatile uint64_t cap;
+ volatile uint64_t ext_cap;
+ volatile uint32_t gcr;
+ volatile uint32_t gsr;
+ volatile uint64_t rta;
+ volatile uint64_t ccr;
+};
+
+#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
+#define VTD_CAP_ND(cap) ((cap) & 0x7)
+#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
+#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
+#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
+
+#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
+#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
+#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
+
+#define VTD_GCR_WBF (1 << 27)
+#define VTD_GCR_SRTP (1 << 30)
+#define VTD_GCR_TE (1 << 31)
+
+#define VTD_GSR_WBFS (1 << 27)
+#define VTD_GSR_RTPS (1 << 30)
+#define VTD_GSR_TES (1 << 31)
+
+#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
+#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
+
+#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
+#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
+#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
+#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
+#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
+#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
+#define VTD_IIR_DOMAIN_P 32
+
+#define VTD_ROOT_PRESENT 0x1
+#define VTD_CTX_PRESENT 0x1
+#define VTD_CTX_TT_ALL (1UL << 2)
+
+#define VTD_PTE_RD (1UL << 0)
+#define VTD_PTE_WR (1UL << 1)
+#define VTD_PTE_SUPERPAGE (1UL << 7)
+#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
+
+struct domain {
+ uint64_t *ptp; /* first level page table page */
+ int pt_levels; /* number of page table levels */
+ int addrwidth; /* 'AW' field in context entry */
+ int spsmask; /* supported super page sizes */
+ u_int id; /* domain id */
+ vm_paddr_t maxaddr; /* highest address to be mapped */
+ SLIST_ENTRY(domain) next;
+};
+
+static SLIST_HEAD(, domain) domhead;
+
+#define DRHD_MAX_UNITS 8
+static int drhd_num;
+static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
+static int max_domains;
+typedef int (*drhd_ident_func_t)(void);
+
+static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
+
+static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
+
+/*
+ * Config space register definitions from the "Intel 5520 and 5500" datasheet.
+ */
+static int
+tylersburg_vtd_ident(void)
+{
+ int units, nlbus;
+ uint16_t did, vid;
+ uint32_t miscsts, vtbar;
+
+ const int bus = 0;
+ const int slot = 20;
+ const int func = 0;
+
+ units = 0;
+
+ vid = pci_cfgregread(bus, slot, func, PCIR_VENDOR, 2);
+ did = pci_cfgregread(bus, slot, func, PCIR_DEVICE, 2);
+ if (vid != 0x8086 || did != 0x342E)
+ goto done;
+
+ /*
+ * Check if this is a dual IOH configuration.
+ */
+ miscsts = pci_cfgregread(bus, slot, func, 0x9C, 4);
+ if (miscsts & (1 << 25))
+ nlbus = pci_cfgregread(bus, slot, func, 0x160, 1);
+ else
+ nlbus = -1;
+
+ vtbar = pci_cfgregread(bus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in legacy IOH is disabled!\n");
+
+ if (nlbus != -1) {
+ vtbar = pci_cfgregread(nlbus, slot, func, 0x180, 4);
+ if (vtbar & 0x1) {
+ vtdmaps[units++] = (struct vtdmap *)
+ PHYS_TO_DMAP(vtbar & 0xffffe000);
+ } else if (bootverbose)
+ printf("VT-d unit in non-legacy IOH is disabled!\n");
+ }
+done:
+ return (units);
+}
+
+static drhd_ident_func_t drhd_ident_funcs[] = {
+ tylersburg_vtd_ident,
+ NULL
+};
+
+static int
+vtd_max_domains(struct vtdmap *vtdmap)
+{
+ int nd;
+
+ nd = VTD_CAP_ND(vtdmap->cap);
+
+ switch (nd) {
+ case 0:
+ return (16);
+ case 1:
+ return (64);
+ case 2:
+ return (256);
+ case 3:
+ return (1024);
+ case 4:
+ return (4 * 1024);
+ case 5:
+ return (16 * 1024);
+ case 6:
+ return (64 * 1024);
+ default:
+ panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
+ }
+}
+
+static u_int
+domain_id(void)
+{
+ u_int id;
+ struct domain *dom;
+
+ /* Skip domain id 0 - it is reserved when Caching Mode field is set */
+ for (id = 1; id < max_domains; id++) {
+ SLIST_FOREACH(dom, &domhead, next) {
+ if (dom->id == id)
+ break;
+ }
+ if (dom == NULL)
+ break; /* found it */
+ }
+
+ if (id >= max_domains)
+ panic("domain ids exhausted");
+
+ return (id);
+}
+
+static void
+vtd_wbflush(struct vtdmap *vtdmap)
+{
+
+ if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
+ pmap_invalidate_cache();
+
+ if (VTD_CAP_RWBF(vtdmap->cap)) {
+ vtdmap->gcr = VTD_GCR_WBF;
+ while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
+ ;
+ }
+}
+
+static void
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
+{
+
+ vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
+ while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
+ ;
+}
+
+static void
+vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
+{
+ int offset;
+ volatile uint64_t *iotlb_reg, val;
+
+ vtd_wbflush(vtdmap);
+
+ offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
+ iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
+
+ *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
+ VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
+
+ while (1) {
+ val = *iotlb_reg;
+ if ((val & VTD_IIR_IVT) == 0)
+ break;
+ }
+}
+
+static void
+vtd_translation_enable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = VTD_GCR_TE;
+ while ((vtdmap->gsr & VTD_GSR_TES) == 0)
+ ;
+}
+
+static void
+vtd_translation_disable(struct vtdmap *vtdmap)
+{
+
+ vtdmap->gcr = 0;
+ while ((vtdmap->gsr & VTD_GSR_TES) != 0)
+ ;
+}
+
+static int
+vtd_init(void)
+{
+ int i, units;
+ struct vtdmap *vtdmap;
+ vm_paddr_t ctx_paddr;
+
+ for (i = 0; drhd_ident_funcs[i] != NULL; i++) {
+ units = (*drhd_ident_funcs[i])();
+ if (units > 0)
+ break;
+ }
+
+ if (units <= 0)
+ return (ENXIO);
+
+ drhd_num = units;
+ vtdmap = vtdmaps[0];
+
+ if (VTD_CAP_CM(vtdmap->cap) != 0)
+ panic("vtd_init: invalid caching mode");
+
+ max_domains = vtd_max_domains(vtdmap);
+
+ /*
+ * Set up the root-table to point to the context-entry tables
+ */
+ for (i = 0; i < 256; i++) {
+ ctx_paddr = vtophys(ctx_tables[i]);
+ if (ctx_paddr & PAGE_MASK)
+ panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+ root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+ }
+
+ return (0);
+}
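
The 'i * 2' indexing above reflects the fact that each root-entry is 128
bits wide even though the table is declared as an array of uint64_t. An
illustration of the shape being built (the driver deliberately manipulates
raw quadwords; this struct and its field names are not from the source):

/* Illustration only: one 128-bit VT-d root-entry. */
struct vtd_root_entry {
	uint64_t	low;	/* context-table paddr | VTD_ROOT_PRESENT */
	uint64_t	high;	/* reserved */
};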
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_wbflush(vtdmap);
+
+ /* Update the root table address */
+ vtdmap->rta = vtophys(root_table);
+ vtdmap->gcr = VTD_GCR_SRTP;
+ while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+ ;
+
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+
+ vtd_translation_enable(vtdmap);
+ }
+}
+
+static void
+vtd_disable(void)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_translation_disable(vtdmap);
+ }
+}
+
+static void
+vtd_add_device(void *arg, int bus, int slot, int func)
+{
+ int idx;
+ uint64_t *ctxp;
+ struct domain *dom = arg;
+ vm_paddr_t pt_paddr;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+ panic("vtd_add_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ vtdmap = vtdmaps[0];
+ ctxp = ctx_tables[bus];
+ pt_paddr = vtophys(dom->ptp);
+ idx = (slot << 3 | func) * 2;
+
+ if (ctxp[idx] & VTD_CTX_PRESENT) {
+ panic("vtd_add_device: device %d/%d/%d is already owned by "
+ "domain %d", bus, slot, func,
+ (uint16_t)(ctxp[idx + 1] >> 8));
+ }
+
+ /*
+ * Order is important. The 'present' bit is set only after all fields
+	 * of the context entry are initialized.
+ */
+ ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+ if (VTD_ECAP_DI(vtdmap->ext_cap))
+ ctxp[idx] = VTD_CTX_TT_ALL;
+ else
+ ctxp[idx] = 0;
+
+ ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+ /*
+ * 'Not Present' entries are not cached in either the Context Cache
+ * or in the IOTLB, so there is no need to invalidate either of them.
+ */
+}
+
+static void
+vtd_remove_device(void *arg, int bus, int slot, int func)
+{
+ int i, idx;
+ uint64_t *ctxp;
+ struct vtdmap *vtdmap;
+
+ if (bus < 0 || bus > PCI_BUSMAX ||
+ slot < 0 || slot > PCI_SLOTMAX ||
+ func < 0 || func > PCI_FUNCMAX)
+		panic("vtd_remove_device: invalid bsf %d/%d/%d", bus, slot, func);
+
+ ctxp = ctx_tables[bus];
+ idx = (slot << 3 | func) * 2;
+
+ /*
+	 * Order is important. The 'present' bit must be cleared first.
+ */
+ ctxp[idx] = 0;
+ ctxp[idx + 1] = 0;
+
+ /*
+ * Invalidate the Context Cache and the IOTLB.
+ *
+ * XXX use device-selective invalidation for Context Cache
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_ctx_global_invalidate(vtdmap);
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+#define CREATE_MAPPING 0
+#define REMOVE_MAPPING 1
+
+static uint64_t
+vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
+ int remove)
+{
+ struct domain *dom;
+ int i, spshift, ptpshift, ptpindex, nlevels;
+ uint64_t spsize, *ptp;
+
+ dom = arg;
+ ptpindex = 0;
+ ptpshift = 0;
+
+	if (gpa & PAGE_MASK)
+		panic("vtd_update_mapping: unaligned gpa 0x%0lx", gpa);
+
+	if (hpa & PAGE_MASK)
+		panic("vtd_update_mapping: unaligned hpa 0x%0lx", hpa);
+
+	if (len & PAGE_MASK)
+		panic("vtd_update_mapping: unaligned len 0x%0lx", len);
+
+ /*
+	 * Compute the size of the mapping that we can accommodate.
+ *
+ * This is based on three factors:
+ * - supported super page size
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = 48;
+ for (i = 3; i >= 0; i--) {
+ spsize = 1UL << spshift;
+ if ((dom->spsmask & (1 << i)) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ (len >= spsize)) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ ptp = dom->ptp;
+ nlevels = dom->pt_levels;
+ while (--nlevels >= 0) {
+ ptpshift = 12 + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift) {
+ break;
+ }
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create a downstream page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+ void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
+			ptp[ptpindex] = vtophys(nlp) | VTD_PTE_RD | VTD_PTE_WR;
+ }
+
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0)
+ panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
+
+ /*
+ * Update the 'gpa' -> 'hpa' mapping
+ */
+ if (remove) {
+ ptp[ptpindex] = 0;
+ } else {
+ ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
+ }
+
+ return (1UL << ptpshift);
+}
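
As a worked example of the sizing logic: with 1GB superpages available (bit
1 of 'spsmask'), gpa = hpa = 0x100000000 and len = 1GB, the loop settles on
spshift = 30. On a 3-level page table the very first step of the walk has
ptpshift = 30, so the walk stops immediately and a single leaf entry with
VTD_PTE_SUPERPAGE maps the entire gigabyte; the function returns 1UL << 30
and the caller (see iommu_create_mapping() below) advances gpa, hpa and the
remaining length by that amount.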
+
+static uint64_t
+vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
+}
+
+static uint64_t
+vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+
+ return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
+}
+
+static void
+vtd_invalidate_tlb(void *dom)
+{
+ int i;
+ struct vtdmap *vtdmap;
+
+ /*
+ * Invalidate the IOTLB.
+ * XXX use domain-selective invalidation for IOTLB
+ */
+ for (i = 0; i < drhd_num; i++) {
+ vtdmap = vtdmaps[i];
+ vtd_iotlb_global_invalidate(vtdmap);
+ }
+}
+
+static void *
+vtd_create_domain(vm_paddr_t maxaddr)
+{
+ struct domain *dom;
+ vm_paddr_t addr;
+ int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
+ struct vtdmap *vtdmap;
+
+ if (drhd_num <= 0)
+ panic("vtd_create_domain: no dma remapping hardware available");
+
+ vtdmap = vtdmaps[0];
+
+ /*
+ * Calculate AGAW.
+ * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
+ */
+ addr = 0;
+ for (gaw = 0; addr < maxaddr; gaw++)
+ addr = 1ULL << gaw;
+
+ res = (gaw - 12) % 9;
+ if (res == 0)
+ agaw = gaw;
+ else
+ agaw = gaw + 9 - res;
+
+ if (agaw > 64)
+ agaw = 64;
+
+ /*
+ * Select the smallest Supported AGAW and the corresponding number
+ * of page table levels.
+ */
+ pt_levels = 2;
+ sagaw = 30;
+ addrwidth = 0;
+ tmp = VTD_CAP_SAGAW(vtdmap->cap);
+ for (i = 0; i < 5; i++) {
+ if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
+ break;
+ pt_levels++;
+ addrwidth++;
+ sagaw += 9;
+ if (sagaw > 64)
+ sagaw = 64;
+ }
+
+ if (i >= 5) {
+ panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
+ VTD_CAP_SAGAW(vtdmap->cap), agaw);
+ }
+
+ dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
+ dom->pt_levels = pt_levels;
+ dom->addrwidth = addrwidth;
+ dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+ dom->id = domain_id();
+ dom->maxaddr = maxaddr;
+ dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
+ if ((uintptr_t)dom->ptp & PAGE_MASK)
+ panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
+
+ SLIST_INSERT_HEAD(&domhead, dom, next);
+
+ return (dom);
+}
+
+static void
+vtd_free_ptp(uint64_t *ptp, int level)
+{
+ int i;
+ uint64_t *nlp;
+
+ if (level > 1) {
+ for (i = 0; i < 512; i++) {
+ if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
+ continue;
+ if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
+ continue;
+ nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
+ vtd_free_ptp(nlp, level - 1);
+ }
+ }
+
+ bzero(ptp, PAGE_SIZE);
+ free(ptp, M_VTD);
+}
+
+static void
+vtd_destroy_domain(void *arg)
+{
+ struct domain *dom;
+
+ dom = arg;
+
+ SLIST_REMOVE(&domhead, dom, domain, next);
+ vtd_free_ptp(dom->ptp, dom->pt_levels);
+ free(dom, M_VTD);
+}
+
+struct iommu_ops iommu_ops_intel = {
+ vtd_init,
+ vtd_cleanup,
+ vtd_enable,
+ vtd_disable,
+ vtd_create_domain,
+ vtd_destroy_domain,
+ vtd_create_mapping,
+ vtd_remove_mapping,
+ vtd_add_device,
+ vtd_remove_device,
+ vtd_invalidate_tlb,
+};
diff --git a/sys/amd64/vmm/io/iommu.c b/sys/amd64/vmm/io/iommu.c
new file mode 100644
index 0000000..c8447cc
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+#include "iommu.h"
+
+static boolean_t iommu_avail;
+static struct iommu_ops *ops;
+static void *host_domain;
+
+static __inline int
+IOMMU_INIT(void)
+{
+ if (ops != NULL)
+ return ((*ops->init)());
+ else
+ return (ENXIO);
+}
+
+static __inline void
+IOMMU_CLEANUP(void)
+{
+ if (ops != NULL && iommu_avail)
+ (*ops->cleanup)();
+}
+
+static __inline void *
+IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_domain)(maxaddr));
+ else
+ return (NULL);
+}
+
+static __inline void
+IOMMU_DESTROY_DOMAIN(void *dom)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->destroy_domain)(dom);
+}
+
+static __inline uint64_t
+IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->create_mapping)(domain, gpa, hpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline uint64_t
+IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ if (ops != NULL && iommu_avail)
+ return ((*ops->remove_mapping)(domain, gpa, len));
+ else
+ return (len); /* XXX */
+}
+
+static __inline void
+IOMMU_ADD_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->add_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_REMOVE_DEVICE(void *domain, int bus, int slot, int func)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->remove_device)(domain, bus, slot, func);
+}
+
+static __inline void
+IOMMU_INVALIDATE_TLB(void *domain)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->invalidate_tlb)(domain);
+}
+
+static __inline void
+IOMMU_ENABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->enable)();
+}
+
+static __inline void
+IOMMU_DISABLE(void)
+{
+
+ if (ops != NULL && iommu_avail)
+ (*ops->disable)();
+}
+
+void
+iommu_init(void)
+{
+ int error, bus, slot, func;
+ vm_paddr_t maxaddr;
+ const char *name;
+ device_t dev;
+
+ if (vmm_is_intel())
+ ops = &iommu_ops_intel;
+ else if (vmm_is_amd())
+ ops = &iommu_ops_amd;
+ else
+ ops = NULL;
+
+ error = IOMMU_INIT();
+ if (error)
+ return;
+
+ iommu_avail = TRUE;
+
+ /*
+ * Create a domain for the devices owned by the host
+ */
+ maxaddr = vmm_mem_maxaddr();
+ host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
+ if (host_domain == NULL)
+ panic("iommu_init: unable to create a host domain");
+
+ /*
+ * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
+ * the host
+ */
+ iommu_create_mapping(host_domain, 0, 0, maxaddr);
+
+ for (bus = 0; bus <= PCI_BUSMAX; bus++) {
+ for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
+ for (func = 0; func <= PCI_FUNCMAX; func++) {
+ dev = pci_find_dbsf(0, bus, slot, func);
+ if (dev == NULL)
+ continue;
+
+ /* skip passthrough devices */
+ name = device_get_name(dev);
+ if (name != NULL && strcmp(name, "ppt") == 0)
+ continue;
+
+ /* everything else belongs to the host domain */
+ iommu_add_device(host_domain, bus, slot, func);
+ }
+ }
+ }
+	IOMMU_ENABLE();
+}
+
+void
+iommu_cleanup(void)
+{
+ IOMMU_DISABLE();
+ IOMMU_DESTROY_DOMAIN(host_domain);
+ IOMMU_CLEANUP();
+}
+
+void *
+iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ return (IOMMU_CREATE_DOMAIN(maxaddr));
+}
+
+void
+iommu_destroy_domain(void *dom)
+{
+
+ IOMMU_DESTROY_DOMAIN(dom);
+}
+
+void
+iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
+{
+ uint64_t mapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
+ gpa += mapped;
+ hpa += mapped;
+ remaining -= mapped;
+ }
+}
+
+void
+iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
+{
+ uint64_t unmapped, remaining;
+
+ remaining = len;
+
+ while (remaining > 0) {
+ unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
+ gpa += unmapped;
+ remaining -= unmapped;
+ }
+}
+
+void *
+iommu_host_domain(void)
+{
+
+ return (host_domain);
+}
+
+void
+iommu_add_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_ADD_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_remove_device(void *dom, int bus, int slot, int func)
+{
+
+ IOMMU_REMOVE_DEVICE(dom, bus, slot, func);
+}
+
+void
+iommu_invalidate_tlb(void *domain)
+{
+
+ IOMMU_INVALIDATE_TLB(domain);
+}
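
Taken together, the wrappers above give consumers a backend-neutral
sequence for wiring a passthrough device to a guest. A hedged sketch with
illustrative addresses and an identity gpa->hpa mapping (the real callers,
with proper error handling, are in vmm.c and ppt.c):

/* Sketch: create a guest IOMMU domain and attach device 5/0/0 to it. */
static void *
example_setup_guest_domain(void)
{
	void *dom;

	dom = iommu_create_domain(1UL << 30);		/* maxaddr = 1GB */
	if (dom == NULL)
		return (NULL);
	iommu_create_mapping(dom, 0, 0, 1UL << 30);	/* gpa == hpa here */
	iommu_add_device(dom, 5, 0, 0);			/* bus/slot/func */
	return (dom);
}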
diff --git a/sys/amd64/vmm/io/iommu.h b/sys/amd64/vmm/io/iommu.h
new file mode 100644
index 0000000..d5c1d6e
--- /dev/null
+++ b/sys/amd64/vmm/io/iommu.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_IOMMU_H_
+#define _IO_IOMMU_H_
+
+typedef int (*iommu_init_func_t)(void);
+typedef void (*iommu_cleanup_func_t)(void);
+typedef void (*iommu_enable_func_t)(void);
+typedef void (*iommu_disable_func_t)(void);
+typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
+typedef void (*iommu_destroy_domain_t)(void *domain);
+typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
+ vm_paddr_t hpa, uint64_t len);
+typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
+ uint64_t len);
+typedef void (*iommu_add_device_t)(void *domain, int bus, int slot, int func);
+typedef void (*iommu_remove_device_t)(void *dom, int bus, int slot, int func);
+typedef void (*iommu_invalidate_tlb_t)(void *dom);
+
+struct iommu_ops {
+ iommu_init_func_t init; /* module wide */
+ iommu_cleanup_func_t cleanup;
+ iommu_enable_func_t enable;
+ iommu_disable_func_t disable;
+
+ iommu_create_domain_t create_domain; /* domain-specific */
+ iommu_destroy_domain_t destroy_domain;
+ iommu_create_mapping_t create_mapping;
+ iommu_remove_mapping_t remove_mapping;
+ iommu_add_device_t add_device;
+ iommu_remove_device_t remove_device;
+ iommu_invalidate_tlb_t invalidate_tlb;
+};
+
+extern struct iommu_ops iommu_ops_intel;
+extern struct iommu_ops iommu_ops_amd;
+
+void iommu_init(void);
+void iommu_cleanup(void);
+void *iommu_host_domain(void);
+void *iommu_create_domain(vm_paddr_t maxaddr);
+void iommu_destroy_domain(void *dom);
+void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
+ size_t len);
+void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
+void iommu_add_device(void *dom, int bus, int slot, int func);
+void iommu_remove_device(void *dom, int bus, int slot, int func);
+void iommu_invalidate_tlb(void *domain);
+#endif
diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c
new file mode 100644
index 0000000..fdf136b
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.c
@@ -0,0 +1,610 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <sys/smp.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/resource.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+
+#include "iommu.h"
+#include "ppt.h"
+
+/* XXX locking */
+
+#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0]))
+#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
+#define MAX_MSIMSGS 32
+
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
+struct pptintr_arg { /* pptintr(pptintr_arg) */
+ struct pptdev *pptdev;
+ int vec;
+ int vcpu;
+};
+
+static struct pptdev {
+ device_t dev;
+ struct vm *vm; /* owner of this device */
+ struct vm_memory_segment mmio[MAX_MMIOSEGS];
+ struct {
+ int num_msgs; /* guest state */
+
+ int startrid; /* host state */
+ struct resource *res[MAX_MSIMSGS];
+ void *cookie[MAX_MSIMSGS];
+ struct pptintr_arg arg[MAX_MSIMSGS];
+ } msi;
+
+ struct {
+ int num_msgs;
+ int startrid;
+ int msix_table_rid;
+ struct resource *msix_table_res;
+ struct resource **res;
+ void **cookie;
+ struct pptintr_arg *arg;
+ } msix;
+} pptdevs[32];
+
+static int num_pptdevs;
+
+static int
+ppt_probe(device_t dev)
+{
+ int bus, slot, func;
+ struct pci_devinfo *dinfo;
+
+ dinfo = (struct pci_devinfo *)device_get_ivars(dev);
+
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+
+ /*
+ * To qualify as a pci passthrough device a device must:
+ * - be allowed by administrator to be used in this role
+ * - be an endpoint device
+ */
+ if (vmm_is_pptdev(bus, slot, func) &&
+ (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+ return (0);
+ else
+ return (ENXIO);
+}
+
+static int
+ppt_attach(device_t dev)
+{
+ int n;
+
+ if (num_pptdevs >= MAX_PPTDEVS) {
+ printf("ppt_attach: maximum number of pci passthrough devices "
+ "exceeded\n");
+ return (ENXIO);
+ }
+
+ n = num_pptdevs++;
+ pptdevs[n].dev = dev;
+
+ if (bootverbose)
+ device_printf(dev, "attached\n");
+
+ return (0);
+}
+
+static int
+ppt_detach(device_t dev)
+{
+ /*
+ * XXX check whether there are any pci passthrough devices assigned
+ * to guests before we allow this driver to detach.
+ */
+
+ return (0);
+}
+
+static device_method_t ppt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, ppt_probe),
+ DEVMETHOD(device_attach, ppt_attach),
+ DEVMETHOD(device_detach, ppt_detach),
+ {0, 0}
+};
+
+static devclass_t ppt_devclass;
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
+
+static struct pptdev *
+ppt_find(int bus, int slot, int func)
+{
+ device_t dev;
+ int i, b, s, f;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ dev = pptdevs[i].dev;
+ b = pci_get_bus(dev);
+ s = pci_get_slot(dev);
+ f = pci_get_function(dev);
+ if (bus == b && slot == s && func == f)
+ return (&pptdevs[i]);
+ }
+ return (NULL);
+}
+
+static void
+ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
+{
+ int i;
+ struct vm_memory_segment *seg;
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0)
+ continue;
+ (void)vm_unmap_mmio(vm, seg->gpa, seg->len);
+ bzero(seg, sizeof(struct vm_memory_segment));
+ }
+}
+
+static void
+ppt_teardown_msi(struct pptdev *ppt)
+{
+ int i, rid;
+ void *cookie;
+ struct resource *res;
+
+ if (ppt->msi.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msi.num_msgs; i++) {
+ rid = ppt->msi.startrid + i;
+ res = ppt->msi.res[i];
+ cookie = ppt->msi.cookie[i];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msi.res[i] = NULL;
+ ppt->msi.cookie[i] = NULL;
+ }
+
+ if (ppt->msi.startrid == 1)
+ pci_release_msi(ppt->dev);
+
+ ppt->msi.num_msgs = 0;
+}
+
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+ int rid;
+ struct resource *res;
+ void *cookie;
+
+ rid = ppt->msix.startrid + idx;
+ res = ppt->msix.res[idx];
+ cookie = ppt->msix.cookie[idx];
+
+ if (cookie != NULL)
+ bus_teardown_intr(ppt->dev, res, cookie);
+
+ if (res != NULL)
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
+
+ ppt->msix.res[idx] = NULL;
+ ppt->msix.cookie[idx] = NULL;
+}
+
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+ int i;
+
+ if (ppt->msix.num_msgs == 0)
+ return;
+
+ for (i = 0; i < ppt->msix.num_msgs; i++)
+ ppt_teardown_msix_intr(ppt, i);
+
+ if (ppt->msix.msix_table_res) {
+ bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+ ppt->msix.msix_table_rid,
+ ppt->msix.msix_table_res);
+ ppt->msix.msix_table_res = NULL;
+ ppt->msix.msix_table_rid = 0;
+ }
+
+ free(ppt->msix.res, M_PPTMSIX);
+ free(ppt->msix.cookie, M_PPTMSIX);
+ free(ppt->msix.arg, M_PPTMSIX);
+
+ pci_release_msi(ppt->dev);
+
+ ppt->msix.num_msgs = 0;
+}
+
+int
+ppt_assign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is owned by a different VM then we
+ * cannot change its owner.
+ */
+ if (ppt->vm != NULL && ppt->vm != vm)
+ return (EBUSY);
+
+ ppt->vm = vm;
+ iommu_add_device(vm_iommu_domain(vm), bus, slot, func);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
+{
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ /*
+ * If this device is not owned by this 'vm' then bail out.
+ */
+ if (ppt->vm != vm)
+ return (EBUSY);
+ ppt_unmap_mmio(vm, ppt);
+ ppt_teardown_msi(ppt);
+ ppt_teardown_msix(ppt);
+ iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
+ ppt->vm = NULL;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+ppt_unassign_all(struct vm *vm)
+{
+ int i, bus, slot, func;
+ device_t dev;
+
+ for (i = 0; i < num_pptdevs; i++) {
+ if (pptdevs[i].vm == vm) {
+ dev = pptdevs[i].dev;
+ bus = pci_get_bus(dev);
+ slot = pci_get_slot(dev);
+ func = pci_get_function(dev);
+ ppt_unassign_device(vm, bus, slot, func);
+ }
+ }
+
+ return (0);
+}
+
+int
+ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ int i, error;
+ struct vm_memory_segment *seg;
+ struct pptdev *ppt;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt != NULL) {
+ if (ppt->vm != vm)
+ return (EBUSY);
+
+ for (i = 0; i < MAX_MMIOSEGS; i++) {
+ seg = &ppt->mmio[i];
+ if (seg->len == 0) {
+ error = vm_map_mmio(vm, gpa, len, hpa);
+ if (error == 0) {
+ seg->gpa = gpa;
+ seg->len = len;
+ }
+ return (error);
+ }
+ }
+ return (ENOSPC);
+ }
+ return (ENOENT);
+}
+
+static int
+pptintr(void *arg)
+{
+ int vec;
+ struct pptdev *ppt;
+ struct pptintr_arg *pptarg;
+
+ pptarg = arg;
+ ppt = pptarg->pptdev;
+ vec = pptarg->vec;
+
+ if (ppt->vm != NULL)
+ (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
+ else {
+ /*
+ * XXX
+ * This is not expected to happen - panic?
+ */
+ }
+
+ /*
+ * For legacy interrupts give other filters a chance in case
+ * the interrupt was not generated by the passthrough device.
+ */
+ if (ppt->msi.startrid == 0)
+ return (FILTER_STRAY);
+ else
+ return (FILTER_HANDLED);
+}
+
+/*
+ * XXX
+ * When we try to free the MSI resource the kernel will bind the thread to
+ * the host cpu that was originally handling the MSI. The function freeing the
+ * MSI vector (apic_free_vector()) will panic the kernel if the thread
+ * is already bound to a cpu.
+ *
+ * So, we temporarily unbind the vcpu thread before freeing the MSI resource.
+ */
+static void
+PPT_TEARDOWN_MSI(struct vm *vm, int vcpu, struct pptdev *ppt)
+{
+ int pincpu = -1;
+
+ vm_get_pinning(vm, vcpu, &pincpu);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, -1);
+
+ ppt_teardown_msi(ppt);
+
+ if (pincpu >= 0)
+ vm_set_pinning(vm, vcpu, pincpu);
+}
+
+int
+ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec)
+{
+ int i, rid, flags;
+ int msi_count, startrid, error, tmp;
+ struct pptdev *ppt;
+
+ if ((destcpu >= VM_MAXCPU || destcpu < 0) ||
+ (vector < 0 || vector > 255) ||
+ (numvec < 0 || numvec > MAX_MSIMSGS))
+ return (EINVAL);
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ /* Free any allocated resources */
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+
+ if (numvec == 0) /* nothing more to do */
+ return (0);
+
+ flags = RF_ACTIVE;
+ msi_count = pci_msi_count(ppt->dev);
+ if (msi_count == 0) {
+ startrid = 0; /* legacy interrupt */
+ msi_count = 1;
+ flags |= RF_SHAREABLE;
+ } else
+ startrid = 1; /* MSI */
+
+ /*
+ * The device must be capable of supporting the number of vectors
+ * the guest wants to allocate.
+ */
+ if (numvec > msi_count)
+ return (EINVAL);
+
+ /*
+ * Make sure that we can allocate all the MSI vectors that are needed
+ * by the guest.
+ */
+ if (startrid == 1) {
+ tmp = numvec;
+ error = pci_alloc_msi(ppt->dev, &tmp);
+ if (error)
+ return (error);
+ else if (tmp != numvec) {
+ pci_release_msi(ppt->dev);
+ return (ENOSPC);
+ } else {
+ /* success */
+ }
+ }
+
+ ppt->msi.startrid = startrid;
+
+ /*
+ * Allocate the irq resource and attach it to the interrupt handler.
+ */
+ for (i = 0; i < numvec; i++) {
+ ppt->msi.num_msgs = i + 1;
+ ppt->msi.cookie[i] = NULL;
+
+ rid = startrid + i;
+ ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, flags);
+ if (ppt->msi.res[i] == NULL)
+ break;
+
+ ppt->msi.arg[i].pptdev = ppt;
+ ppt->msi.arg[i].vec = vector + i;
+ ppt->msi.arg[i].vcpu = destcpu;
+
+ error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msi.arg[i],
+ &ppt->msi.cookie[i]);
+ if (error != 0)
+ break;
+ }
+
+ if (i < numvec) {
+ PPT_TEARDOWN_MSI(vm, vcpu, ppt);
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+ struct pptdev *ppt;
+ struct pci_devinfo *dinfo;
+ int numvec, alloced, rid, error;
+ size_t res_size, cookie_size, arg_size;
+
+ ppt = ppt_find(bus, slot, func);
+ if (ppt == NULL)
+ return (ENOENT);
+ if (ppt->vm != vm) /* Make sure we own this device */
+ return (EBUSY);
+
+ dinfo = device_get_ivars(ppt->dev);
+ if (!dinfo)
+ return (ENXIO);
+
+ /*
+ * First-time configuration:
+ * Allocate the MSI-X table
+ * Allocate the IRQ resources
+ * Set up some variables in ppt->msix
+ */
+ if (ppt->msix.num_msgs == 0) {
+ numvec = pci_msix_count(ppt->dev);
+ if (numvec <= 0)
+ return (EINVAL);
+
+ ppt->msix.startrid = 1;
+ ppt->msix.num_msgs = numvec;
+
+ res_size = numvec * sizeof(ppt->msix.res[0]);
+ cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
+ arg_size = numvec * sizeof(ppt->msix.arg[0]);
+
+ ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+ ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
+ M_WAITOK | M_ZERO);
+ ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
+
+ rid = dinfo->cfg.msix.msix_table_bar;
+ ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+
+ if (ppt->msix.msix_table_res == NULL) {
+ ppt_teardown_msix(ppt);
+ return (ENOSPC);
+ }
+ ppt->msix.msix_table_rid = rid;
+
+ alloced = numvec;
+ error = pci_alloc_msix(ppt->dev, &alloced);
+ if (error || alloced != numvec) {
+ ppt_teardown_msix(ppt);
+			return (error == 0 ? ENOSPC : error);
+ }
+ }
+
+ if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* Tear down the IRQ if it's already set up */
+ ppt_teardown_msix_intr(ppt, idx);
+
+ /* Allocate the IRQ resource */
+ ppt->msix.cookie[idx] = NULL;
+ rid = ppt->msix.startrid + idx;
+ ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+ &rid, RF_ACTIVE);
+ if (ppt->msix.res[idx] == NULL)
+ return (ENXIO);
+
+ ppt->msix.arg[idx].pptdev = ppt;
+ ppt->msix.arg[idx].vec = msg;
+ ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+ /* Setup the MSI-X interrupt */
+ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+ INTR_TYPE_NET | INTR_MPSAFE,
+ pptintr, NULL, &ppt->msix.arg[idx],
+ &ppt->msix.cookie[idx]);
+
+ if (error != 0) {
+ bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
+ bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
+ ppt->msix.cookie[idx] = NULL;
+ ppt->msix.res[idx] = NULL;
+ return (ENXIO);
+ }
+ } else {
+ /* Masked, tear it down if it's already been set up */
+ ppt_teardown_msix_intr(ppt, idx);
+ }
+
+ return (0);
+}
+
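ppt_setup_msix() above treats bits 19:12 of the MSI address as the target vcpu and the low byte of the message data as the vector, following the x86 MSI address/data format. A small standalone sketch of that decode; the example address and data values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t addr = 0xfee01000;		/* MSI address, dest id 1 */
	uint32_t data = 0x0041;			/* MSI data, vector 0x41 */
	unsigned dest = (addr >> 12) & 0xff;	/* bits 19:12, as in ppt.c */
	unsigned vec = data & 0xff;

	printf("deliver vector 0x%x to vcpu %u\n", vec, dest);
	return (0);
}
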
diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h
new file mode 100644
index 0000000..63c8228
--- /dev/null
+++ b/sys/amd64/vmm/io/ppt.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IO_PPT_H_
+#define _IO_PPT_H_
+
+int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
+int ppt_unassign_all(struct vm *vm);
+int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int destcpu, int vector, int numvec);
+int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
+#endif
diff --git a/sys/amd64/vmm/io/vdev.c b/sys/amd64/vmm/io/vdev.c
new file mode 100644
index 0000000..cd6c5d1
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+	int rc = 0;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+ default:
+ break;
+ }
+ if (rc) {
+			printf("vdev %s event %d failed rc=%d\n",
+				vd->ops->name, event, rc);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+	// FIXME: we should verify we are in the context of the current
+	// vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
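
vdev_find_region() only matches when the access lies entirely inside a registered region: the start must fall within [base, base+len) and the end at or before base+len. A standalone sketch of that containment test, reusing the IN_RANGE macro; the example values are made up.

#include <stdint.h>
#include <stdio.h>

#define IN_RANGE(val, start, end) \
	(((val) >= (start)) && ((val) < (end)))

static int
region_contains(uint64_t rbase, uint64_t rlen, uint64_t base, uint64_t len)
{
	uint64_t rend = rbase + rlen;

	return (IN_RANGE(base, rbase, rend) &&
	    IN_RANGE(base + len, rbase, rend + 1));
}

int
main(void)
{

	/* A 4-byte access at 0xfee00020 against a 4KB window at 0xfee00000. */
	printf("%d\n", region_contains(0xfee00000, 4096, 0xfee00020, 4));
	return (0);
}
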
diff --git a/sys/amd64/vmm/io/vdev.h b/sys/amd64/vmm/io/vdev.h
new file mode 100644
index 0000000..6feeba8
--- /dev/null
+++ b/sys/amd64/vmm/io/vdev.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
+
+#endif /* _VDEV_H_ */
+
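The vdev_ops structure gives every emulated device the same init/reset/halt and memread/memwrite shape. Below is a standalone sketch of a trivial read-as-zero device using local stand-in types; demo_ops and zero_read are illustrative names, not the kernel interface.

#include <stdint.h>
#include <stdio.h>

typedef int (*memread_t)(void *dev, uint64_t gpa, int size, uint64_t *data);

struct demo_ops {
	const char	*name;
	memread_t	memread;
};

static int
zero_read(void *dev, uint64_t gpa, int size, uint64_t *data)
{

	printf("%d-byte read at 0x%llx\n", size, (unsigned long long)gpa);
	*data = 0;			/* device reads back as zero */
	return (0);
}

static struct demo_ops zerodev = { "zero", zero_read };

int
main(void)
{
	uint64_t val;

	return (zerodev.memread(NULL, 0xfee00030, 4, &val));
}
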
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
new file mode 100644
index 0000000..15fc6c2
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -0,0 +1,901 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/clock.h>
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vdev.h"
+#include "vlapic.h"
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VMM_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VMM_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic.irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic.isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+#define PRIO(x) ((x) >> 4)
+
+#define VLAPIC_VERSION (16)
+#define VLAPIC_MAXLVT_ENTRIES (5)
+
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+
+ struct io_region *mmio;
+ struct vdev_ops *ops;
+ struct LAPIC apic;
+
+ int esr_update;
+
+ int divisor;
+ int ccr_ticks;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+};
+
+#define VLAPIC_BUS_FREQ tsc_freq
+
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+ switch (dcr & 0xB) {
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+	case APIC_TDCR_1:
+		return (1);
+	default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
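
The divide configuration register encodes the divisor in bits 0, 1 and 3, and the switch above maps the eight encodings from divide-by-2 up to divide-by-1. An equivalent closed form, shown as a standalone sketch; the helper name tdcr_to_divisor is illustrative.

#include <stdint.h>
#include <stdio.h>

static int
tdcr_to_divisor(uint32_t dcr)
{
	/* Bits 3,1,0 give n; divisor is 2^(n+1), with n == 7 meaning 1. */
	unsigned n = ((dcr & 0x8) >> 1) | (dcr & 0x3);

	return (1 << ((n + 1) & 0x7));
}

int
main(void)
{

	/* 0x0 -> 2, 0xa -> 128, 0xb -> 1 */
	printf("%d %d %d\n", tdcr_to_divisor(0x0), tdcr_to_divisor(0xa),
	    tdcr_to_divisor(0xb));
	return (0);
}
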
+
+static void
+vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
+{
+ int i;
+ for (i = 0; i < num_lvt; i++) {
+ *lvts |= APIC_LVT_M;
+ lvts += 4;
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint64_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ return lapic->ccr_timer;
+}
+
+static void
+vlapic_update_errors(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->esr = 0; // XXX
+}
+
+static void
+vlapic_init_ipi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1);
+}
+
+static int
+vlapic_op_reset(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+
+ memset(lapic, 0, sizeof(*lapic));
+ lapic->apr = vlapic->vcpuid;
+ vlapic_init_ipi(vlapic);
+ vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+	return 0;
+}
+
+static int
+vlapic_op_init(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_register_region(vlapic->ops, vlapic, vlapic->mmio);
+ return vlapic_op_reset(dev);
+}
+
+static int
+vlapic_op_halt(void* dev)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ vdev_unregister_region(vlapic, vlapic->mmio);
+	return 0;
+}
+
+void
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr;
+ int idx;
+
+ if (vector < 0 || vector >= 256)
+ panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+
+ idx = (vector / 32) * 4;
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+}
+
+static void
+vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed)
+{
+ uint32_t icr_timer;
+
+ icr_timer = vlapic->apic.icr_timer;
+
+ vlapic->ccr_ticks = ticks;
+ if (elapsed < icr_timer)
+ vlapic->apic.ccr_timer = icr_timer - elapsed;
+ else {
+ /*
+		 * This can happen when the guest tries to run its local
+		 * apic timer at a rate higher than the host's 'hz' setting.
+ *
+ * We deal with this by running the guest local apic timer
+ * at the rate of the host's 'hz' setting.
+ */
+ vlapic->apic.ccr_timer = 0;
+ }
+}
+
+static __inline uint32_t *
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int i;
+
+ if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) {
+ panic("vlapic_get_lvt: invalid LVT\n");
+ }
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+	return ((&lapic->lvt_timer) + i);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic.isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic.tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic.isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic.ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
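
The PPR rule above reduces to: take the TPR when its priority class (upper nibble) is at least that of the highest in-service vector, otherwise take that vector's class with the low nibble cleared. A standalone sketch of just that computation; compute_ppr is an illustrative name.

#include <stdio.h>

#define PRIO(x) ((x) >> 4)

static int
compute_ppr(int tpr, int isrvec)
{

	return (PRIO(tpr) >= PRIO(isrvec) ? tpr : (isrvec & 0xf0));
}

int
main(void)
{

	/* TPR class 2 vs in-service vector 0x35 (class 3) -> PPR 0x30. */
	printf("0x%02x\n", compute_ppr(0x20, 0x35));
	return (0);
}
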
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *isrptr;
+ int i, idx, bitpos;
+
+ isrptr = &lapic->isr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << (bitpos - 1));
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask)
+{
+ return (*lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ int vector;
+ uint32_t *lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) {
+ vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR);
+ vlapic_set_intr_ready(vlapic, vector);
+ }
+}
+
+static int
+lapic_process_icr(struct vlapic *vlapic, uint64_t icrval)
+{
+ int i;
+ cpuset_t dmask;
+ uint32_t dest, vec, mode;
+ struct vlapic *vlapic2;
+ struct vm_exit *vmexit;
+
+ if (x2apic(vlapic))
+ dest = icrval >> 32;
+ else
+ dest = icrval >> (32 + 24);
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ CPU_SETOF(dest, &dmask);
+ break;
+ case APIC_DEST_SELF:
+ CPU_SETOF(vlapic->vcpuid, &dmask);
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ CPU_CLR(vlapic->vcpuid, &dmask);
+ break;
+ }
+
+ while ((i = cpusetobj_ffs(&dmask)) != 0) {
+ i--;
+ CPU_CLR(i, &dmask);
+ if (mode == APIC_DELMODE_FIXED)
+ lapic_set_intr(vlapic->vm, i, vec);
+ else
+ vm_inject_nmi(vlapic->vm, i);
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ if (mode == APIC_DELMODE_INIT) {
+ if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+ return (0);
+
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /* move from INIT to waiting-for-SIPI state */
+ if (vlapic2->boot_state == BS_INIT) {
+ vlapic2->boot_state = BS_SIPI;
+ }
+
+ return (0);
+ }
+ }
+
+ if (mode == APIC_DELMODE_STARTUP) {
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (vlapic2->boot_state != BS_SIPI)
+ return (0);
+
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+ /*
+ * XXX this assumes that the startup IPI always succeeds
+ */
+ vlapic2->boot_state = BS_RUNNING;
+ vm_activate_cpu(vlapic2->vm, dest);
+
+ return (0);
+ }
+ }
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
+int
+vlapic_pending_intr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ irrptr = &lapic->irr0;
+
+ /*
+	 * The x86 architecture reserves the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ return (vector);
+ } else
+ break;
+ }
+ }
+ VLAPIC_CTR0(vlapic, "no pending intr");
+ return (-1);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = &vlapic->apic;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+int
+vlapic_op_mem_read(void* dev, uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1);
+ uint32_t *reg;
+ int i;
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ return 0;
+ }
+
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ if (x2apic(vlapic))
+ *data = vlapic->vcpuid;
+ else
+ *data = vlapic->vcpuid << 24;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = lapic->tpr;
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ *data = *(reg);
+ break;
+ case APIC_OFFSET_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+ return 0;
+}
+
+int
+vlapic_op_mem_write(void* dev, uint64_t gpa, opsize_t size, uint64_t data)
+{
+ struct vlapic *vlapic = (struct vlapic*)dev;
+ struct LAPIC *lapic = &vlapic->apic;
+	uint64_t offset = gpa & (PAGE_SIZE - 1);
+ uint32_t *reg;
+ int retval;
+
+ if (offset > sizeof(*lapic)) {
+ return 0;
+ }
+
+ retval = 0;
+ offset &= ~3;
+	switch (offset) {
+ case APIC_OFFSET_ID:
+ break;
+ case APIC_OFFSET_TPR:
+ lapic->tpr = data & 0xff;
+ vlapic_update_ppr(vlapic);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ break;
+ case APIC_OFFSET_DFR:
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ if (!x2apic(vlapic)) {
+ data &= 0xffffffff;
+ data |= (uint64_t)lapic->icr_hi << 32;
+ }
+ retval = lapic_process_icr(vlapic, data);
+ break;
+ case APIC_OFFSET_ICR_HI:
+ if (!x2apic(vlapic)) {
+ retval = 0;
+ lapic->icr_hi = data;
+ }
+ break;
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ reg = vlapic_get_lvt(vlapic, offset);
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ data |= APIC_LVT_M;
+ }
+ *reg = data;
+ // vlapic_dump_lvt(offset, reg);
+ break;
+ case APIC_OFFSET_ICR:
+ lapic->icr_timer = data;
+ vlapic_start_timer(vlapic, 0);
+ break;
+
+ case APIC_OFFSET_DCR:
+ lapic->dcr_timer = data;
+ vlapic->divisor = vlapic_timer_divisor(data);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_update_errors(vlapic);
+ break;
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
+int
+vlapic_timer_tick(struct vlapic *vlapic)
+{
+ int curticks, delta, periodic, fired;
+ uint32_t ccr;
+ uint32_t decrement, leftover;
+
+restart:
+ curticks = ticks;
+ delta = curticks - vlapic->ccr_ticks;
+
+ /* Local APIC timer is disabled */
+ if (vlapic->apic.icr_timer == 0)
+ return (-1);
+
+ /* One-shot mode and timer has already counted down to zero */
+ periodic = vlapic_periodic_timer(vlapic);
+ if (!periodic && vlapic->apic.ccr_timer == 0)
+ return (-1);
+ /*
+ * The 'curticks' and 'ccr_ticks' are out of sync by more than
+ * 2^31 ticks. We deal with this by restarting the timer.
+ */
+ if (delta < 0) {
+ vlapic_start_timer(vlapic, 0);
+ goto restart;
+ }
+
+ fired = 0;
+ decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz;
+
+ vlapic->ccr_ticks = curticks;
+ ccr = vlapic->apic.ccr_timer;
+
+ while (delta-- > 0) {
+ if (ccr > decrement) {
+ ccr -= decrement;
+ continue;
+ }
+
+ /* Trigger the local apic timer interrupt */
+ vlapic_fire_timer(vlapic);
+ if (periodic) {
+ leftover = decrement - ccr;
+ vlapic_start_timer(vlapic, leftover);
+ ccr = vlapic->apic.ccr_timer;
+ } else {
+ /*
+ * One-shot timer has counted down to zero.
+ */
+ ccr = 0;
+ }
+ fired = 1;
+ break;
+ }
+
+ vlapic->apic.ccr_timer = ccr;
+
+ if (!fired)
+ return ((ccr / decrement) + 1);
+ else
+ return (0);
+}
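
vlapic_timer_tick() converts host clock ticks into timer counts: each host tick is worth (VLAPIC_BUS_FREQ / divisor) / hz counts, and the return value estimates how many host ticks remain before the timer fires. A standalone sketch of that arithmetic with made-up frequency values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t bus_freq = 2000000000;	/* stand-in for tsc_freq */
	int divisor = 2, hz = 1000;
	uint32_t decrement, ccr;

	decrement = (bus_freq / divisor) / hz;	/* counts per host tick */
	ccr = 3 * decrement + 100;		/* current count register */

	printf("timer fires in about %u host ticks\n", ccr / decrement + 1);
	return (0);
}
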
+
+struct vdev_ops vlapic_dev_ops = {
+ .name = "vlapic",
+ .init = vlapic_op_init,
+ .reset = vlapic_op_reset,
+ .halt = vlapic_op_halt,
+ .memread = vlapic_op_mem_read,
+ .memwrite = vlapic_op_mem_write,
+};
+static struct io_region vlapic_mmio[VM_MAXCPU];
+
+struct vlapic *
+vlapic_init(struct vm *vm, int vcpuid)
+{
+ struct vlapic *vlapic;
+
+ vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vm;
+ vlapic->vcpuid = vcpuid;
+
+ vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
+
+ if (vcpuid == 0)
+ vlapic->msr_apicbase |= APICBASE_BSP;
+
+ vlapic->ops = &vlapic_dev_ops;
+
+ vlapic->mmio = vlapic_mmio + vcpuid;
+ vlapic->mmio->base = DEFAULT_APIC_BASE;
+ vlapic->mmio->len = PAGE_SIZE;
+ vlapic->mmio->attr = MMIO_READ|MMIO_WRITE;
+ vlapic->mmio->vcpu = vcpuid;
+
+ vdev_register(&vlapic_dev_ops, vlapic);
+
+ vlapic_op_init(vlapic);
+
+ return (vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+ vlapic_op_halt(vlapic);
+ vdev_unregister(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
+uint64_t
+vlapic_get_apicbase(struct vlapic *vlapic)
+{
+
+ return (vlapic->msr_apicbase);
+}
+
+void
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
+{
+ int err;
+ enum x2apic_state state;
+
+ err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
+ if (err)
+ panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
+
+ if (state == X2APIC_DISABLED)
+ val &= ~APICBASE_X2APIC;
+
+ vlapic->msr_apicbase = val;
+}
+
+void
+vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, vcpuid);
+
+ if (state == X2APIC_DISABLED)
+ vlapic->msr_apicbase &= ~APICBASE_X2APIC;
+}
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
new file mode 100644
index 0000000..00de019
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+#include "vdev.h"
+
+struct vm;
+
+/*
+ * Map of APIC Registers: Offset Description Access
+ */
+#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
+#define APIC_OFFSET_VER 0x30 // Local APIC Version R
+#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
+#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
+#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
+#define APIC_OFFSET_EOI 0xB0 // EOI Register W
+#define APIC_OFFSET_RRR 0xC0 // Remote read R
+#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
+#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
+#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
+#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
+#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
+#define APIC_OFFSET_ISR3 0x130 // ISR 096-127 R
+#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
+#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
+#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
+#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
+#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
+#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
+#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
+#define APIC_OFFSET_TMR3 0x1B0 // TMR 096-127 R
+#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
+#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
+#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
+#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
+#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
+#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
+#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
+#define APIC_OFFSET_IRR3 0x230 // IRR 096-127 R
+#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
+#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
+#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
+#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
+#define APIC_OFFSET_ESR 0x280 // Error Status Register R
+#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
+#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
+#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
+#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
+#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
+#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
+#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
+#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
+#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
+#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
+#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
+
+enum x2apic_state;
+
+struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+int vlapic_op_mem_write(void* dev, uint64_t gpa,
+ opsize_t size, uint64_t data);
+
+int vlapic_op_mem_read(void* dev, uint64_t gpa,
+ opsize_t size, uint64_t *data);
+
+int vlapic_pending_intr(struct vlapic *vlapic);
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+void vlapic_set_intr_ready(struct vlapic *vlapic, int vector);
+int vlapic_timer_tick(struct vlapic *vlapic);
+
+uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
+void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
+void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+
+#endif /* _VLAPIC_H_ */
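
The ISR/TMR/IRR banks declared above are 32-bit registers spaced 16 bytes apart, which is why vlapic.c indexes them as (vector / 32) * 4 against a uint32_t pointer. A standalone sketch of the vector-to-register mapping:

#include <stdio.h>

int
main(void)
{
	int vector = 0x41;
	int idx = (vector / 32) * 4;	/* uint32_t index: 16-byte stride */
	int bit = vector % 32;

	printf("vector 0x%x -> IRR%d (word index %d), bit %d\n",
	    vector, vector / 32, idx, bit);
	return (0);
}
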
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
new file mode 100644
index 0000000..a4dea79
--- /dev/null
+++ b/sys/amd64/vmm/vmm.c
@@ -0,0 +1,1022 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include <machine/vmm_dev.h>
+#include "vlapic.h"
+#include "vmm_msr.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+
+#include "io/ppt.h"
+#include "io/iommu.h"
+
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ enum vcpu_state state;
+ struct mtx mtx;
+ int pincpu; /* host cpuid this vcpu is bound to */
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ uint64_t guest_msrs[VMM_MSR_NUM];
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu *guestfpu; /* guest fpu state */
+ void *stats;
+ struct vm_exit exitinfo;
+ enum x2apic_state x2apic_state;
+ int nmi_pending;
+};
+#define VCPU_F_PINNED 0x0001
+
+#define VCPU_PINCPU(vm, vcpuid) \
+ ((vm->vcpu[vcpuid].flags & VCPU_F_PINNED) ? vm->vcpu[vcpuid].pincpu : -1)
+
+#define VCPU_UNPIN(vm, vcpuid) (vm->vcpu[vcpuid].flags &= ~VCPU_F_PINNED)
+
+#define VCPU_PIN(vm, vcpuid, host_cpuid) \
+do { \
+ vm->vcpu[vcpuid].flags |= VCPU_F_PINNED; \
+ vm->vcpu[vcpuid].pincpu = host_cpuid; \
+} while(0)
+
+#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
+#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
+#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+
+#define VM_MAX_MEMORY_SEGMENTS 2
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Set of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpuset_t active_cpus;
+};
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
+#define VMRUN(vmi, vcpu, rip) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? \
+ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
+ ENXIO)
+#define VMMMAP_GET(vmi, gpa) \
+ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
+ (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
+
+#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
+#define fpu_stop_emulating() clts()
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
+
+/* statistics */
+static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static void
+vcpu_cleanup(struct vcpu *vcpu)
+{
+ vlapic_cleanup(vcpu->vlapic);
+ vmm_stat_free(vcpu->stats);
+ fpu_save_area_free(vcpu->guestfpu);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu_lock_init(vcpu);
+ vcpu->hostcpu = NOCPU;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
+ vcpu->guestfpu = fpu_save_area_alloc();
+ fpu_save_area_reset(vcpu->guestfpu);
+ vcpu->stats = vmm_stat_alloc();
+}
+
+struct vm_exit *
+vm_exitinfo(struct vm *vm, int cpuid)
+{
+ struct vcpu *vcpu;
+
+ if (cpuid < 0 || cpuid >= VM_MAXCPU)
+ panic("vm_exitinfo: invalid cpuid %d", cpuid);
+
+ vcpu = &vm->vcpu[cpuid];
+
+ return (&vcpu->exitinfo);
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+ vmm_host_state_init();
+ vmm_ipi_init();
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ vmm_msr_init();
+
+ return (VMM_INIT());
+}
+
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ iommu_init();
+ error = vmm_init();
+ break;
+ case MOD_UNLOAD:
+ error = vmmdev_cleanup();
+ if (error == 0) {
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ }
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * Execute the module load handler after the pci passthru driver has had
+ * a chance to claim devices. We need this information at the time we do
+ * iommu initialization.
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_CONFIGURE + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+struct vm *
+vm_create(const char *name)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (NULL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ guest_msrs_init(vm, i);
+ }
+
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+ vm_activate_cpu(vm, BSP);
+
+ return (vm);
+}
+
+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+ size_t len;
+ vm_paddr_t hpa;
+ void *host_domain;
+
+ host_domain = iommu_host_domain();
+
+ len = 0;
+ while (len < seg->len) {
+ hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+ if (hpa == (vm_paddr_t)-1) {
+			panic("vm_free_mem_seg: cannot free hpa "
+ "associated with gpa 0x%016lx", seg->gpa + len);
+ }
+
+ /*
+ * Remove the 'gpa' to 'hpa' mapping in VMs domain.
+ * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ */
+ iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
+ iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
+
+ vmm_mem_free(hpa, PAGE_SIZE);
+
+ len += PAGE_SIZE;
+ }
+
+ /*
+ * Invalidate cached translations associated with 'vm->iommu' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(vm->iommu);
+
+ bzero(seg, sizeof(struct vm_memory_segment));
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+ ppt_unassign_all(vm);
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
+
+ vm->num_mem_segs = 0;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(&vm->vcpu[i]);
+
+ iommu_destroy_domain(vm->iommu);
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
+ VM_PROT_NONE, spok));
+}
+
+/*
+ * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
+ */
+static boolean_t
+vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit;
+
+ if (gpa & PAGE_MASK)
+ panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa < gpalimit)
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int error, available, allocated;
+ struct vm_memory_segment *seg;
+ vm_paddr_t g, hpa;
+ void *host_domain;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+ return (EINVAL);
+
+ available = allocated = 0;
+ g = gpa;
+ while (g < gpa + len) {
+ if (vm_gpa_available(vm, g))
+ available++;
+ else
+ allocated++;
+
+ g += PAGE_SIZE;
+ }
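+
+	/*
+	 * Illustrative case: a two-page request in which only the first
+	 * page already lies inside an existing segment ends the loop above
+	 * with allocated = 1 and available = 1 and is rejected below.
+	 */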
+
+ /*
+ * If there are some allocated and some available pages in the address
+ * range then it is an error.
+ */
+ if (allocated && available)
+ return (EINVAL);
+
+ /*
+ * If the entire address range being requested has already been
+ * allocated then there isn't anything more to do.
+ */
+ if (allocated && available == 0)
+ return (0);
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+ host_domain = iommu_host_domain();
+
+ seg = &vm->mem_segs[vm->num_mem_segs];
+
+ error = 0;
+ seg->gpa = gpa;
+ seg->len = 0;
+ while (seg->len < len) {
+ hpa = vmm_mem_alloc(PAGE_SIZE);
+ if (hpa == 0) {
+ error = ENOMEM;
+ break;
+ }
+
+ error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
+ VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
+ if (error)
+ break;
+
+ /*
+ * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
+ * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ */
+ iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
+ iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+
+ seg->len += PAGE_SIZE;
+ }
+
+ if (error) {
+ vm_free_mem_seg(vm, seg);
+ return (error);
+ }
+
+ /*
+ * Invalidate cached translations associated with 'host_domain' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(host_domain);
+
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ vm_paddr_t nextpage;
+
+ nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
+ if (len > nextpage - gpa)
+ panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
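+
+	/*
+	 * E.g. with 4KB pages, gpa 0x1ffc and len 8 would span a page
+	 * boundary (nextpage = 0x2000, nextpage - gpa = 4) and panic above.
+	 */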
+
+ return (VMMMAP_GET(vm->cookie, gpa));
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMSETREG(vm->cookie, vcpu, reg, val));
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_get_pinning(struct vm *vm, int vcpuid, int *cpuid)
+{
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *cpuid = VCPU_PINCPU(vm, vcpuid);
+
+ return (0);
+}
+
+int
+vm_set_pinning(struct vm *vm, int vcpuid, int host_cpuid)
+{
+ struct thread *td;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ td = curthread; /* XXXSMP only safe when muxing vcpus */
+
+ /* unpin */
+ if (host_cpuid < 0) {
+ VCPU_UNPIN(vm, vcpuid);
+ thread_lock(td);
+ sched_unbind(td);
+ thread_unlock(td);
+ return (0);
+ }
+
+ if (CPU_ABSENT(host_cpuid))
+ return (EINVAL);
+
+ /*
+ * XXX we should check that 'host_cpuid' has not already been pinned
+ * by another vm.
+ */
+ thread_lock(td);
+ sched_bind(td, host_cpuid);
+ thread_unlock(td);
+ VCPU_PIN(vm, vcpuid, host_cpuid);
+
+ return (0);
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+
+ /* flush host state to the pcb */
+ fpuexit(curthread);
+
+ /* restore guest FPU state */
+ fpu_stop_emulating();
+ fpurestore(vcpu->guestfpu);
+
+ /*
+ * The FPU is now "dirty" with the guest's state so turn on emulation
+ * to trap any access to the FPU by the host.
+ */
+ fpu_start_emulating();
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+
+ if ((rcr0() & CR0_TS) == 0)
+ panic("fpu emulation not enabled in host!");
+
+ /* save guest FPU state */
+ fpu_stop_emulating();
+ fpusave(vcpu->guestfpu);
+ fpu_start_emulating();
+}
+
+static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid, sleepticks, t;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval, rip;
+ struct vm_exit *vme;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vmrun->vm_exit;
+ rip = vmrun->rip;
+restart:
+ critical_enter();
+
+ tscval = rdtsc();
+
+ pcb = PCPU_GET(curpcb);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+
+ restore_guest_msrs(vm, vcpuid);
+ restore_guest_fpustate(vcpu);
+
+ vcpu->hostcpu = curcpu;
+ error = VMRUN(vm->cookie, vcpuid, rip);
+ vcpu->hostcpu = NOCPU;
+
+ save_guest_fpustate(vcpu);
+ restore_host_msrs(vm, vcpuid);
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ /* copy the exit information */
+ bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
+
+ critical_exit();
+
+ /*
+ * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
+ * is ready to run.
+ */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) &&
+ lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+
+ vcpu_unlock(vcpu);
+
+ rip = vme->rip + vme->inst_length;
+ goto restart;
+ }
+
+ return (error);
+}
+
+int
+vm_inject_event(struct vm *vm, int vcpuid, int type,
+ int vector, uint32_t code, int code_valid)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)
+ return (EINVAL);
+
+ if (vector < 0 || vector > 255)
+ return (EINVAL);
+
+ return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
+}
+
+static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->nmi_pending = 1;
+ vm_interrupt_hostcpu(vm, vcpuid);
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+uint64_t *
+vm_guest_msrs(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].guest_msrs);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, b, s, f, n;
+ char *val, *cp, *cp2;
+
+ /*
+ * setenv pptdevs "1/2/3 4/5/6 7/8/9 10/11/12"
+ */
+ found = 0;
+ cp = val = getenv("pptdevs");
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ return (found);
+}
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+int
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+{
+ int error;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> RUNNING -> IDLE
+ * IDLE -> CANNOT_RUN -> IDLE
+ */
+ if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
+ (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
+ error = 0;
+ vcpu->state = state;
+ } else {
+ error = EBUSY;
+ }
+
+ vcpu_unlock(vcpu);
+
+ return (error);
+}
+
+enum vcpu_state
+vcpu_get_state(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ state = vcpu->state;
+ vcpu_unlock(vcpu);
+
+ return (state);
+}
+
+void
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, &vm->active_cpus);
+}
+
+cpuset_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
+
+int
+vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *state = vm->vcpu[vcpuid].x2apic_state;
+
+ return (0);
+}
+
+int
+vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (state < 0 || state >= X2APIC_STATE_LAST)
+ return (EINVAL);
+
+ vm->vcpu[vcpuid].x2apic_state = state;
+
+ vlapic_set_x2apic_state(vm, vcpuid, state);
+
+ return (0);
+}
+
+void
+vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
+{
+ int hostcpu;
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ hostcpu = vcpu->hostcpu;
+ if (hostcpu == NOCPU) {
+ /*
+ * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
+ * the host thread must be sleeping waiting for an event to
+ * kick the vcpu out of 'hlt'.
+ *
+ * XXX this is racy because the condition exists right before
+ * and after calling VMRUN() in vm_run(). The wakeup() is
+ * benign in this case.
+ */
+ if (vcpu->state == VCPU_RUNNING)
+ wakeup_one(vcpu);
+ } else {
+ if (vcpu->state != VCPU_RUNNING)
+ panic("invalid vcpu state %d", vcpu->state);
+ if (hostcpu != curcpu)
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
+ vcpu_unlock(vcpu);
+}
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
new file mode 100644
index 0000000..0150ebd
--- /dev/null
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -0,0 +1,538 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+#include <sys/ioccom.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_stat.h"
+#include "vmm_mem.h"
+#include "io/ppt.h"
+#include <machine/vmm_dev.h>
+
+struct vmmdev_softc {
+ struct vm *vm; /* vm instance cookie */
+ struct cdev *cdev;
+ SLIST_ENTRY(vmmdev_softc) link;
+};
+static SLIST_HEAD(, vmmdev_softc) head;
+
+static struct mtx vmmdev_mtx;
+
+static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
+
+SYSCTL_DECL(_hw_vmm);
+
+static struct vmmdev_softc *
+vmmdev_lookup(const char *name)
+{
+ struct vmmdev_softc *sc;
+
+#ifdef notyet /* XXX kernel is not compiled with invariants */
+ mtx_assert(&vmmdev_mtx, MA_OWNED);
+#endif
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(name, vm_name(sc->vm)) == 0)
+ break;
+ }
+
+ return (sc);
+}
+
+static struct vmmdev_softc *
+vmmdev_lookup2(struct cdev *cdev)
+{
+
+ return (cdev->si_drv1);
+}
+
+static int
+vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
+{
+ int error, off, c;
+ vm_paddr_t hpa, gpa;
+ struct vmmdev_softc *sc;
+
+ static char zerobuf[PAGE_SIZE];
+
+ error = 0;
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ error = ENXIO;
+
+ while (uio->uio_resid > 0 && error == 0) {
+ gpa = uio->uio_offset;
+ off = gpa & PAGE_MASK;
+ c = min(uio->uio_resid, PAGE_SIZE - off);
+
+ /*
+ * The VM has a hole in its physical memory map. If we want to
+ * use 'dd' to inspect memory beyond the hole we need to
+ * provide bogus data for memory that lies in the hole.
+ *
+ * Since this device does not support lseek(2), dd(1) will
+ * read(2) blocks of data to simulate the lseek(2).
+ */
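+		/*
+		 * Illustrative invocation (hypothetical VM name):
+		 *	dd if=/dev/vmm/testvm bs=4k skip=768 count=1
+		 * reads the page at gpa 0x300000 and sees zeroes if that
+		 * page falls inside the hole.
+		 */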
+ hpa = vm_gpa2hpa(sc->vm, gpa, c);
+ if (hpa == (vm_paddr_t)-1) {
+ if (uio->uio_rw == UIO_READ)
+ error = uiomove(zerobuf, c, uio);
+ else
+ error = EFAULT;
+ } else
+ error = uiomove((void *)PHYS_TO_DMAP(hpa), c, uio);
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+ return (error);
+}
+
+static int
+vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ int error, vcpu, state_changed;
+ enum vcpu_state new_state;
+ struct vmmdev_softc *sc;
+ struct vm_memory_segment *seg;
+ struct vm_register *vmreg;
+	struct vm_seg_desc *vmsegdesc;
+ struct vm_pin *vmpin;
+ struct vm_run *vmrun;
+ struct vm_event *vmevent;
+ struct vm_lapic_irq *vmirq;
+ struct vm_capability *vmcap;
+ struct vm_pptdev *pptdev;
+ struct vm_pptdev_mmio *pptmmio;
+ struct vm_pptdev_msi *pptmsi;
+ struct vm_pptdev_msix *pptmsix;
+ struct vm_nmi *vmnmi;
+ struct vm_stats *vmstats;
+ struct vm_stat_desc *statdesc;
+ struct vm_x2apic *x2apic;
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc == NULL)
+ return (ENXIO);
+
+ vcpu = -1;
+ state_changed = 0;
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_SET_PINNING:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EVENT:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSIX:
+ case VM_SET_X2APIC_STATE:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ vcpu = *(int *)data;
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ if (cmd == VM_RUN)
+ new_state = VCPU_RUNNING;
+ else
+ new_state = VCPU_CANNOT_RUN;
+
+ error = vcpu_set_state(sc->vm, vcpu, new_state);
+ if (error)
+ goto done;
+
+ state_changed = 1;
+ break;
+
+ case VM_MAP_PPTDEV_MMIO:
+ case VM_BIND_PPTDEV:
+ case VM_UNBIND_PPTDEV:
+ case VM_MAP_MEMORY:
+ /*
+ * ioctls that operate on the entire virtual machine must
+ * prevent all vcpus from running.
+ */
+ error = 0;
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_CANNOT_RUN);
+ if (error)
+ break;
+ }
+
+ if (error) {
+ while (--vcpu >= 0)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ goto done;
+ }
+
+ state_changed = 2;
+ break;
+
+ default:
+ break;
+ }
+
+ switch(cmd) {
+ case VM_RUN:
+ vmrun = (struct vm_run *)data;
+ error = vm_run(sc->vm, vmrun);
+ break;
+ case VM_STAT_DESC: {
+ const char *desc;
+ statdesc = (struct vm_stat_desc *)data;
+ desc = vmm_stat_desc(statdesc->index);
+ if (desc != NULL) {
+ error = 0;
+ strlcpy(statdesc->desc, desc, sizeof(statdesc->desc));
+ } else
+ error = EINVAL;
+ break;
+ }
+ case VM_STATS: {
+ CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_TYPES);
+ vmstats = (struct vm_stats *)data;
+ getmicrotime(&vmstats->tv);
+ error = vmm_stat_copy(sc->vm, vmstats->cpuid,
+ &vmstats->num_entries, vmstats->statbuf);
+ break;
+ }
+ case VM_PPTDEV_MSI:
+ pptmsi = (struct vm_pptdev_msi *)data;
+ error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
+ pptmsi->bus, pptmsi->slot, pptmsi->func,
+ pptmsi->destcpu, pptmsi->vector,
+ pptmsi->numvec);
+ break;
+ case VM_PPTDEV_MSIX:
+ pptmsix = (struct vm_pptdev_msix *)data;
+ error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+ pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx,
+ pptmsix->msg, pptmsix->vector_control,
+ pptmsix->addr);
+ break;
+ case VM_MAP_PPTDEV_MMIO:
+ pptmmio = (struct vm_pptdev_mmio *)data;
+ error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
+ pptmmio->func, pptmmio->gpa, pptmmio->len,
+ pptmmio->hpa);
+ break;
+ case VM_BIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_assign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_UNBIND_PPTDEV:
+ pptdev = (struct vm_pptdev *)data;
+ error = ppt_unassign_device(sc->vm, pptdev->bus, pptdev->slot,
+ pptdev->func);
+ break;
+ case VM_INJECT_EVENT:
+ vmevent = (struct vm_event *)data;
+ error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type,
+ vmevent->vector,
+ vmevent->error_code,
+ vmevent->error_code_valid);
+ break;
+ case VM_INJECT_NMI:
+ vmnmi = (struct vm_nmi *)data;
+ error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
+ break;
+ case VM_LAPIC_IRQ:
+ vmirq = (struct vm_lapic_irq *)data;
+ error = lapic_set_intr(sc->vm, vmirq->cpuid, vmirq->vector);
+ break;
+ case VM_SET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_set_pinning(sc->vm, vmpin->vm_cpuid,
+ vmpin->host_cpuid);
+ break;
+ case VM_GET_PINNING:
+ vmpin = (struct vm_pin *)data;
+ error = vm_get_pinning(sc->vm, vmpin->vm_cpuid,
+ &vmpin->host_cpuid);
+ break;
+ case VM_MAP_MEMORY:
+ seg = (struct vm_memory_segment *)data;
+ error = vm_malloc(sc->vm, seg->gpa, seg->len);
+ break;
+ case VM_GET_MEMORY_SEG:
+ seg = (struct vm_memory_segment *)data;
+ seg->len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg);
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ &vmreg->regval);
+ break;
+ case VM_SET_REGISTER:
+ vmreg = (struct vm_register *)data;
+ error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
+ vmreg->regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ vmsegdesc = (struct vm_seg_desc *)data;
+ error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
+ vmsegdesc->regnum,
+ &vmsegdesc->desc);
+ break;
+ case VM_GET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_get_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ &vmcap->capval);
+ break;
+ case VM_SET_CAPABILITY:
+ vmcap = (struct vm_capability *)data;
+ error = vm_set_capability(sc->vm, vmcap->cpuid,
+ vmcap->captype,
+ vmcap->capval);
+ break;
+ case VM_SET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_set_x2apic_state(sc->vm,
+ x2apic->cpuid, x2apic->state);
+ break;
+ case VM_GET_X2APIC_STATE:
+ x2apic = (struct vm_x2apic *)data;
+ error = vm_get_x2apic_state(sc->vm,
+ x2apic->cpuid, &x2apic->state);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ if (state_changed == 1) {
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ } else if (state_changed == 2) {
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE);
+ }
+
+done:
+ return (error);
+}
+
+static int
+vmmdev_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ int error;
+ struct vmmdev_softc *sc;
+
+ error = -1;
+ mtx_lock(&vmmdev_mtx);
+
+ sc = vmmdev_lookup2(cdev);
+ if (sc != NULL && (nprot & PROT_EXEC) == 0) {
+ *paddr = vm_gpa2hpa(sc->vm, (vm_paddr_t)offset, PAGE_SIZE);
+ if (*paddr != (vm_paddr_t)-1)
+ error = 0;
+ }
+
+ mtx_unlock(&vmmdev_mtx);
+
+ return (error);
+}
+
+static void
+vmmdev_destroy(struct vmmdev_softc *sc, boolean_t unlink)
+{
+
+ /*
+ * XXX must stop virtual machine instances that may be still
+ * running and cleanup their state.
+ */
+ if (sc->cdev)
+ destroy_dev(sc->cdev);
+
+ if (sc->vm)
+ vm_destroy(sc->vm);
+
+ if (unlink) {
+ mtx_lock(&vmmdev_mtx);
+ SLIST_REMOVE(&head, sc, vmmdev_softc, link);
+ mtx_unlock(&vmmdev_mtx);
+ }
+
+ free(sc, M_VMMDEV);
+}
+
+static int
+sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ char buf[VM_MAX_NAMELEN];
+ struct vmmdev_softc *sc;
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ /*
+ * XXX TODO if any process has this device open then fail
+ */
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ if (sc == NULL) {
+ mtx_unlock(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ sc->cdev->si_drv1 = NULL;
+ mtx_unlock(&vmmdev_mtx);
+
+ vmmdev_destroy(sc, TRUE);
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_destroy, "A", NULL);
+
+static struct cdevsw vmmdevsw = {
+ .d_name = "vmmdev",
+ .d_version = D_VERSION,
+ .d_ioctl = vmmdev_ioctl,
+ .d_mmap = vmmdev_mmap,
+ .d_read = vmmdev_rw,
+ .d_write = vmmdev_rw,
+};
+
+static int
+sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vm *vm;
+ struct vmmdev_softc *sc, *sc2;
+ char buf[VM_MAX_NAMELEN];
+
+ strlcpy(buf, "beavis", sizeof(buf));
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ mtx_lock(&vmmdev_mtx);
+ sc = vmmdev_lookup(buf);
+ mtx_unlock(&vmmdev_mtx);
+ if (sc != NULL)
+ return (EEXIST);
+
+ vm = vm_create(buf);
+ if (vm == NULL)
+ return (EINVAL);
+
+ sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
+ sc->vm = vm;
+
+ /*
+ * Lookup the name again just in case somebody sneaked in when we
+ * dropped the lock.
+ */
+ mtx_lock(&vmmdev_mtx);
+ sc2 = vmmdev_lookup(buf);
+ if (sc2 == NULL)
+ SLIST_INSERT_HEAD(&head, sc, link);
+ mtx_unlock(&vmmdev_mtx);
+
+ if (sc2 != NULL) {
+ vmmdev_destroy(sc, FALSE);
+ return (EEXIST);
+ }
+
+ sc->cdev = make_dev(&vmmdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+ "vmm/%s", buf);
+ sc->cdev->si_drv1 = sc;
+
+ return (0);
+}
+SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
+ NULL, 0, sysctl_vmm_create, "A", NULL);
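+
+/*
+ * Illustrative usage (hypothetical VM name):
+ *	sysctl hw.vmm.create=testvm	# creates /dev/vmm/testvm
+ *	sysctl hw.vmm.destroy=testvm	# tears it down
+ */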
+
+void
+vmmdev_init(void)
+{
+ mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
+}
+
+int
+vmmdev_cleanup(void)
+{
+ int error;
+
+ if (SLIST_EMPTY(&head))
+ error = 0;
+ else
+ error = EBUSY;
+
+ return (error);
+}
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
new file mode 100644
index 0000000..8dfef73
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.c
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+ vmm_host_efer = rdmsr(MSR_EFER);
+ vmm_host_pat = rdmsr(MSR_PAT);
+
+ /*
+ * We always want CR0.TS to be set when the processor does a VM exit.
+ *
+ * With emulation turned on unconditionally after a VM exit, we are
+ * able to trap inadvertent use of the FPU until the guest FPU state
+ * has been safely squirreled away.
+ */
+ vmm_host_cr0 = rcr0() | CR0_TS;
+
+ vmm_host_cr4 = rcr4();
+}
+
+uint64_t
+vmm_get_host_pat(void)
+{
+
+ return (vmm_host_pat);
+}
+
+uint64_t
+vmm_get_host_efer(void)
+{
+
+ return (vmm_host_efer);
+}
+
+uint64_t
+vmm_get_host_cr0(void)
+{
+
+ return (vmm_host_cr0);
+}
+
+uint64_t
+vmm_get_host_cr4(void)
+{
+
+ return (vmm_host_cr4);
+}
+
+uint64_t
+vmm_get_host_datasel(void)
+{
+
+ return (GSEL(GDATA_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_codesel(void)
+{
+
+ return (GSEL(GCODE_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_tsssel(void)
+{
+
+ return (GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+uint64_t
+vmm_get_host_fsbase(void)
+{
+
+ return (0);
+}
+
+uint64_t
+vmm_get_host_idtrbase(void)
+{
+
+ return (r_idt.rd_base);
+}
diff --git a/sys/amd64/vmm/vmm_host.h b/sys/amd64/vmm/vmm_host.h
new file mode 100644
index 0000000..839f54a
--- /dev/null
+++ b/sys/amd64/vmm/vmm_host.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_HOST_H_
+#define _VMM_HOST_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+void vmm_host_state_init(void);
+
+uint64_t vmm_get_host_pat(void);
+uint64_t vmm_get_host_efer(void);
+uint64_t vmm_get_host_cr0(void);
+uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_datasel(void);
+uint64_t vmm_get_host_codesel(void);
+uint64_t vmm_get_host_tsssel(void);
+uint64_t vmm_get_host_fsbase(void);
+uint64_t vmm_get_host_idtrbase(void);
+
+/*
+ * Inline access to host state that is used on every VM entry
+ */
+static __inline uint64_t
+vmm_get_host_trbase(void)
+{
+
+ return ((uint64_t)PCPU_GET(tssp));
+}
+
+static __inline uint64_t
+vmm_get_host_gdtrbase(void)
+{
+
+ return ((uint64_t)&gdt[NGDT * curcpu]);
+}
+
+struct pcpu;
+extern struct pcpu __pcpu[];
+
+static __inline uint64_t
+vmm_get_host_gsbase(void)
+{
+
+ return ((uint64_t)&__pcpu[curcpu]);
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
new file mode 100644
index 0000000..e73f6bb
--- /dev/null
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -0,0 +1,810 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+#endif /* _KERNEL */
+
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ },
+ [0x81] = {
+ /* XXX Group 1 extended opcode - not just AND */
+ .op_byte = 0x81,
+ .op_type = VIE_OP_TYPE_AND,
+ .op_flags = VIE_OP_F_IMM,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
+
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
+
+static enum vm_reg_name gpr_map[16] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15
+};
+
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
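+
+/*
+ * Note: per x86-64 convention a 4-byte register write zero-extends into
+ * the upper 32 bits, while 1- and 2-byte writes preserve the remaining
+ * bytes; vie_update_register() below applies size2mask accordingly.
+ */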
+
+static int
+vie_valid_register(enum vm_reg_name reg)
+{
+#ifdef _KERNEL
+ /*
+ * XXX
+ * The operand register in which we store the result of the
+ * read must be a GPR that we can modify even if the vcpu
+ * is "running". All the GPRs qualify except for %rsp.
+ *
+ * This is a limitation of the vm_set_register() API
+ * and can be fixed if necessary.
+ */
+ if (reg == VM_REG_GUEST_RSP)
+ return (0);
+#endif
+ return (1);
+}
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ if (!vie_valid_register(reg))
+ return (EINVAL);
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+/*
+ * The following simplifying assumptions are made during emulation:
+ *
+ * - guest is in 64-bit mode
+ * - default address size is 64-bits
+ * - default operand size is 32-bits
+ *
+ * - operand size override is not supported
+ *
+ * - address size override is not supported
+ */
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ if (vie->rex_w)
+ size = 8;
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate; /* already sign-extended */
+
+ if (vie->rex_w)
+ size = 8;
+
+ if (size != 8)
+ val &= size2mask[size];
+
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val1, val2;
+
+ size = 4;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ val1 &= val2;
+ error = vie_update_register(vm, vcpuid, reg, val1, size);
+ break;
+ case 0x81:
+ /*
+		 * AND mem (ModRM:r/m) with immediate and store the
+		 * result in mem.
+		 *
+		 * 81/4		and r/m32, imm32
+		 * REX.W + 81/4	and r/m64, imm32 sign-extended to 64
+ *
+ * Currently, only the AND operation of the 0x81 opcode
+ * is implemented (ModRM:reg = b100).
+ */
+ if ((vie->reg & 7) != 4)
+ break;
+
+ if (vie->rex_w)
+ size = 8;
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ val1 &= vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, val1, size, arg);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+#ifdef _KERNEL
+static void
+vie_init(struct vie *vie)
+{
+
+ bzero(vie, sizeof(struct vie));
+
+ vie->base_register = VM_REG_LAST;
+ vie->index_register = VM_REG_LAST;
+}
+
+static int
+gla2gpa(struct vm *vm, uint64_t gla, uint64_t ptpphys,
+ uint64_t *gpa, uint64_t *gpaend)
+{
+ vm_paddr_t hpa;
+ int nlevels, ptpshift, ptpindex;
+ uint64_t *ptpbase, pte, pgsize;
+
+ /*
+ * XXX assumes 64-bit guest with 4 page walk levels
+ */
+ nlevels = 4;
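+	/*
+	 * Illustrative walk: on the first (PML4) iteration nlevels drops
+	 * to 3, so ptpshift = 12 + 3 * 9 = 39 and ptpindex selects bits
+	 * 47:39 of 'gla'; each lower level peels off another 9 bits.
+	 */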
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits and the upper 12 bits */
+ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+
+ hpa = vm_gpa2hpa(vm, ptpphys, PAGE_SIZE);
+ if (hpa == -1)
+ goto error;
+
+ ptpbase = (uint64_t *)PHYS_TO_DMAP(hpa);
+
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gla >> ptpshift) & 0x1FF;
+ pgsize = 1UL << ptpshift;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0)
+ goto error;
+
+ if (pte & PG_PS) {
+ if (pgsize > 1 * GB)
+ goto error;
+ else
+ break;
+ }
+
+ ptpphys = pte;
+ }
+
+ /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
+ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
+ *gpa = pte | (gla & (pgsize - 1));
+ *gpaend = pte + pgsize;
+ return (0);
+
+error:
+ return (-1);
+}
+
+int
+vmm_fetch_instruction(struct vm *vm, int cpuid, uint64_t rip, int inst_length,
+ uint64_t cr3, struct vie *vie)
+{
+ int n, err;
+ uint64_t hpa, gpa, gpaend, off;
+
+ /*
+ * XXX cache previously fetched instructions using 'rip' as the tag
+ */
+
+ if (inst_length > VIE_INST_SIZE)
+ panic("vmm_fetch_instruction: invalid length %d", inst_length);
+
+ vie_init(vie);
+
+ /* Copy the instruction into 'vie' */
+ while (vie->num_valid < inst_length) {
+ err = gla2gpa(vm, rip, cr3, &gpa, &gpaend);
+ if (err)
+ break;
+
+ off = gpa & PAGE_MASK;
+ n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
+
+ hpa = vm_gpa2hpa(vm, gpa, n);
+ if (hpa == -1)
+ break;
+
+ bcopy((void *)PHYS_TO_DMAP(hpa), &vie->inst[vie->num_valid], n);
+
+ rip += n;
+ vie->num_valid += n;
+ }
+
+ if (vie->num_valid == inst_length)
+ return (0);
+ else
+ return (-1);
+}
+
+static int
+vie_peek(struct vie *vie, uint8_t *x)
+{
+
+ if (vie->num_processed < vie->num_valid) {
+ *x = vie->inst[vie->num_processed];
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+vie_advance(struct vie *vie)
+{
+
+ vie->num_processed++;
+}
+
+static int
+decode_rex(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
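+	/*
+	 * Example: a REX prefix of 0x48 (0100.1000b) sets only REX.W,
+	 * selecting a 64-bit operand size.
+	 */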
+ if (x >= 0x40 && x <= 0x4F) {
+ vie->rex_w = x & 0x8 ? 1 : 0;
+ vie->rex_r = x & 0x4 ? 1 : 0;
+ vie->rex_x = x & 0x2 ? 1 : 0;
+ vie->rex_b = x & 0x1 ? 1 : 0;
+
+ vie_advance(vie);
+ }
+
+ return (0);
+}
+
+static int
+decode_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = one_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+ return (0);
+}
+
+/*
+ * XXX assuming 32-bit or 64-bit guest
+ */
+static int
+decode_modrm(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->mod = (x >> 6) & 0x3;
+ vie->rm = (x >> 0) & 0x7;
+ vie->reg = (x >> 3) & 0x7;
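+
+	/*
+	 * Example: a ModRM byte of 0x4b decodes as mod=01 (disp8),
+	 * reg=001 (%rcx) and r/m=011 (%rbx), before the REX extension
+	 * bits are folded in below.
+	 */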
+
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
+ if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
+ (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
+ } else {
+ vie->rm |= (vie->rex_b << 3);
+ }
+
+ vie->reg |= (vie->rex_r << 3);
+
+ /* SIB */
+ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+ goto done;
+
+ vie->base_register = gpr_map[vie->rm];
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ case VIE_MOD_INDIRECT:
+ if (vie->rm == VIE_RM_DISP32) {
+ vie->disp_bytes = 4;
+ vie->base_register = VM_REG_LAST; /* no base */
+ }
+ break;
+ }
+
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM)
+ vie->imm_bytes = 4;
+ else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ vie->imm_bytes = 1;
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
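+
+	/*
+	 * Example: a SIB byte of 0xd1 decodes as ss=11, index=010 and
+	 * base=001, i.e. %rcx as base plus %rdx scaled by 8, absent any
+	 * REX.X/REX.B extension.
+	 */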
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+ * Special case when base register is unused if mod = 0
+ * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
+
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+ int n, i;
+ uint8_t x;
+
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->disp_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_displacement: invalid disp_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->displacement = u.signed8; /* sign-extended */
+ else
+ vie->displacement = u.signed32; /* sign-extended */
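+	/*
+	 * E.g. a single displacement byte of 0xf8 is read through the
+	 * 'signed8' member above and sign-extends to -8.
+	 */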
+
+ return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->imm_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_immediate: invalid imm_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->immediate = u.signed8; /* sign-extended */
+ else
+ vie->immediate = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+#define VERIFY_GLA
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
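+/*
+ * The recomputed effective address is base + scale * index + displacement;
+ * e.g. for a decoded "mov (%rbx,%rcx,8), %eax" with zero displacement the
+ * check below requires gla == %rbx + 8 * %rcx. (Illustrative instruction
+ * only.)
+ */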
+#ifdef VERIFY_GLA
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx;
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ if (base + vie->scale * idx + vie->displacement != gla) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla);
+ return (-1);
+ }
+
+ return (0);
+}
+#endif /* VERIFY_GLA */
+
+int
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+
+ if (decode_rex(vie))
+ return (-1);
+
+ if (decode_opcode(vie))
+ return (-1);
+
+ if (decode_modrm(vie))
+ return (-1);
+
+ if (decode_sib(vie))
+ return (-1);
+
+ if (decode_displacement(vie))
+ return (-1);
+
+ if (decode_immediate(vie))
+ return (-1);
+
+#ifdef VERIFY_GLA
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+#endif
+
+ vie->decoded = 1; /* success */
+
+ return (0);
+}
+#endif /* _KERNEL */
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
new file mode 100644
index 0000000..643d326
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+#include <machine/md_var.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+
+extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
+
+/*
+ * The default is to use the IPI_AST to interrupt a vcpu.
+ */
+int vmm_ipinum = IPI_AST;
+
+CTASSERT(APIC_SPURIOUS_INT == 255);
+
+void
+vmm_ipi_init(void)
+{
+ int idx;
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ /*
+ * Search backwards from the highest IDT vector available for use
+ * as our IPI vector. We install the 'justreturn' handler at that
+ * vector and use it to interrupt the vcpus.
+ *
+ * We do this because the IPI_AST is heavyweight and saves all
+ * registers in the trapframe. This is overkill for our use case
+ * which is simply to EOI the interrupt and return.
+ */
+ idx = APIC_SPURIOUS_INT;
+ while (--idx >= APIC_IPI_INTS) {
+ ip = &idt[idx];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ if (func == (uintptr_t)&IDTVEC(rsvd)) {
+ vmm_ipinum = idx;
+ setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+ SEL_KPL, 0);
+ break;
+ }
+ }
+
+ if (vmm_ipinum != IPI_AST && bootverbose) {
+ printf("vmm_ipi_init: installing ipi handler to interrupt "
+ "vcpus at vector %d\n", vmm_ipinum);
+ }
+}
+
+void
+vmm_ipi_cleanup(void)
+{
+ if (vmm_ipinum != IPI_AST)
+ setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+}
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
new file mode 100644
index 0000000..91552e3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+struct vm;
+
+extern int vmm_ipinum;
+
+void vmm_ipi_init(void);
+void vmm_ipi_cleanup(void);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_ktr.h b/sys/amd64/vmm/vmm_ktr.h
new file mode 100644
index 0000000..e691c61
--- /dev/null
+++ b/sys/amd64/vmm/vmm_ktr.h
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#define KTR_VMM KTR_GEN
+
+#define VMM_CTR0(vm, vcpuid, format) \
+CTR3(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu)
+
+#define VMM_CTR1(vm, vcpuid, format, p1) \
+CTR4(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1))
+
+#define VMM_CTR2(vm, vcpuid, format, p1, p2) \
+CTR5(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2))
+
+#define VMM_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR6(KTR_VMM, "vm %s-%d(%d): " format, vm_name((vm)), (vcpuid), curcpu, \
+ (p1), (p2), (p3))
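+
+/*
+ * Illustrative usage, assuming a vm/vcpuid pair is in scope:
+ *
+ *	VMM_CTR1(vm, vcpuid, "vm exit at rip 0x%lx", rip);
+ *
+ * This expands to a CTR4() record tagged with the vm name, the vcpuid
+ * and the current host cpu.
+ */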
+#endif
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
new file mode 100644
index 0000000..d024b71
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+int
+lapic_pending_intr(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_pending_intr(vlapic));
+}
+
+void
+lapic_intr_accepted(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ vlapic_intr_accepted(vlapic, vector);
+}
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ vlapic_set_intr_ready(vlapic, vector);
+
+ vm_interrupt_hostcpu(vm, cpu);
+
+ return (0);
+}
+
+int
+lapic_timer_tick(struct vm *vm, int cpu)
+{
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ return (vlapic_timer_tick(vlapic));
+}
+
+static boolean_t
+x2apic_msr(u_int msr)
+{
+ if (msr >= 0x800 && msr <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
+ return ((msr - 0x800) << 4);
+}
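+
+/*
+ * For example, the x2APIC TPR msr 0x808 maps to regoff
+ * (0x808 - 0x800) << 4 = 0x80, the offset of the TPR in the
+ * memory-mapped APIC page.
+ */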
+
+boolean_t
+lapic_msr(u_int msr)
+{
+
+ if (x2apic_msr(msr) || (msr == MSR_APICBASE))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+int
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ *rval = vlapic_get_apicbase(vlapic);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_read(vlapic, offset, DWORD, rval);
+ }
+
+ return (error);
+}
+
+int
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ vlapic_set_apicbase(vlapic, val);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_op_mem_write(vlapic, offset, DWORD, val);
+ }
+
+ return (error);
+}
+
+int
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_write(vlapic, off, DWORD, wval);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_op_mem_read(vlapic, off, DWORD, rval);
+ return (error);
+}
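+
+/*
+ * Worked example: a 4-byte guest access at gpa 0xFEE00080 yields
+ * off = 0x80 (the TPR) and passes the check above; a 2-byte access,
+ * or one at gpa 0xFEE00084, fails it and returns EINVAL.
+ */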
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
new file mode 100644
index 0000000..a79912e
--- /dev/null
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+boolean_t lapic_msr(u_int num);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval);
+
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+
+int lapic_timer_tick(struct vm *vm, int cpu);
+
+/*
+ * Returns a vector between 32 and 255 if an interrupt is pending in the
+ * IRR that can be delivered based on the current state of ISR and TPR.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ *
+ * Returns -1 if there is no eligible vector that can be delivered to the
+ * guest at this time.
+ */
+int lapic_pending_intr(struct vm *vm, int cpu);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'lapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
+
+/*
+ * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
+ * for 'cpu'; the pending vector is recorded in the IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector);
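+
+/*
+ * Taken together, the expected interrupt lifecycle is (sketch):
+ *
+ *	lapic_set_intr(vm, cpu, vec);		vec recorded in the IRR
+ *	vec = lapic_pending_intr(vm, cpu);	highest deliverable vector
+ *	(inject 'vec' into the guest once RFLAGS.IF permits)
+ *	lapic_intr_accepted(vm, cpu, vec);	vec moves from IRR to ISR
+ */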
+
+#endif
diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c
new file mode 100644
index 0000000..04f99b1
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.c
@@ -0,0 +1,135 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/linker.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+
+#include <machine/md_var.h>
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+#include <machine/vmparam.h>
+#include <machine/pmap.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+SYSCTL_DECL(_hw_vmm);
+
+static u_long pages_allocated;
+SYSCTL_ULONG(_hw_vmm, OID_AUTO, pages_allocated, CTLFLAG_RD,
+ &pages_allocated, 0, "4KB pages allocated");
+
+static void
+update_pages_allocated(int howmany)
+{
+ pages_allocated += howmany; /* XXX locking? */
+}
+
+int
+vmm_mem_init(void)
+{
+
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ int flags;
+ vm_page_t m;
+ vm_paddr_t pa;
+
+ if (size != PAGE_SIZE)
+ panic("vmm_mem_alloc: invalid allocation size %lu", size);
+
+ flags = VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
+ VM_ALLOC_ZERO;
+
+ while (1) {
+ /*
+ * XXX need policy to determine when to back off the allocation
+ */
+ m = vm_page_alloc(NULL, 0, flags);
+ if (m == NULL)
+ VM_WAIT;
+ else
+ break;
+ }
+
+ pa = VM_PAGE_TO_PHYS(m);
+
+ if ((m->flags & PG_ZERO) == 0)
+ pagezero((void *)PHYS_TO_DMAP(pa));
+ m->valid = VM_PAGE_BITS_ALL;
+
+ update_pages_allocated(1);
+
+ return (pa);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ vm_page_t m;
+
+ if (base & PAGE_MASK) {
+ panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+ "0x%0x boundary\n", base, PAGE_SIZE);
+ }
+
+ if (length != PAGE_SIZE)
+ panic("vmm_mem_free: invalid length %lu", length);
+
+ m = PHYS_TO_VM_PAGE(base);
+ m->wire_count--;
+ vm_page_free(m);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+
+ update_pages_allocated(-1);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (ptoa(Maxmem));
+}
diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h
new file mode 100644
index 0000000..7d45c74
--- /dev/null
+++ b/sys/amd64/vmm/vmm_mem.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
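+
+/*
+ * Illustrative pairing of the interfaces above: allocations are single
+ * wired, zeroed 4KB pages and must be freed with the same size, e.g.
+ *
+ *	vm_paddr_t pa = vmm_mem_alloc(PAGE_SIZE);
+ *	...
+ *	vmm_mem_free(pa, PAGE_SIZE);
+ */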
+
+#endif
diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c
new file mode 100644
index 0000000..d97c819
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.c
@@ -0,0 +1,254 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_lapic.h"
+#include "vmm_msr.h"
+
+#define VMM_MSR_F_EMULATE 0x01
+#define VMM_MSR_F_READONLY 0x02
+#define VMM_MSR_F_INVALID 0x04 /* guest_msr_valid() can override this */
+
+struct vmm_msr {
+ int num;
+ int flags;
+ uint64_t hostval;
+};
+
+static struct vmm_msr vmm_msr[] = {
+ { MSR_LSTAR, 0 },
+ { MSR_CSTAR, 0 },
+ { MSR_STAR, 0 },
+ { MSR_SF_MASK, 0 },
+ { MSR_PAT, VMM_MSR_F_EMULATE | VMM_MSR_F_INVALID },
+ { MSR_BIOS_SIGN,VMM_MSR_F_EMULATE },
+ { MSR_MCG_CAP, VMM_MSR_F_EMULATE | VMM_MSR_F_READONLY },
+};
+
+#define vmm_msr_num (sizeof(vmm_msr) / sizeof(vmm_msr[0]))
+CTASSERT(VMM_MSR_NUM >= vmm_msr_num);
+
+#define readonly_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_READONLY) != 0)
+
+#define emulated_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_EMULATE) != 0)
+
+#define invalid_msr(idx) \
+ ((vmm_msr[(idx)].flags & VMM_MSR_F_INVALID) != 0)
+
+void
+vmm_msr_init(void)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ /*
+ * XXX this assumes that the value of the host msr does not
+ * change after we have cached it.
+ */
+ vmm_msr[i].hostval = rdmsr(vmm_msr[i].num);
+ }
+}
+
+void
+guest_msrs_init(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ switch (vmm_msr[i].num) {
+ case MSR_LSTAR:
+ case MSR_CSTAR:
+ case MSR_STAR:
+ case MSR_SF_MASK:
+ case MSR_BIOS_SIGN:
+ case MSR_MCG_CAP:
+ guest_msrs[i] = 0;
+ break;
+ case MSR_PAT:
+ guest_msrs[i] = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ break;
+ default:
+ panic("guest_msrs_init: missing initialization for msr "
+ "0x%0x", vmm_msr[i].num);
+ }
+ }
+}
+
+static int
+msr_num_to_idx(u_int num)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++)
+ if (vmm_msr[i].num == num)
+ return (i);
+
+ return (-1);
+}
+
+int
+emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val)
+{
+ int idx;
+ uint64_t *guest_msrs;
+
+ if (lapic_msr(num))
+ return (lapic_wrmsr(vm, cpu, num, val));
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx))
+ return (EINVAL);
+
+ if (!readonly_msr(idx)) {
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ /* Stash the value */
+ guest_msrs[idx] = val;
+
+ /* Update processor state for non-emulated MSRs */
+ if (!emulated_msr(idx))
+ wrmsr(vmm_msr[idx].num, val);
+ }
+
+ return (0);
+}
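+
+/*
+ * For example, a guest wrmsr to MSR_LSTAR is stashed in guest_msrs[]
+ * and also written to the hardware msr (the host value is put back by
+ * restore_host_msrs()), while a wrmsr to the emulated MSR_BIOS_SIGN
+ * only updates the cached value and never touches the hardware
+ * register.
+ */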
+
+int
+emulate_rdmsr(struct vm *vm, int cpu, u_int num)
+{
+ int error, idx;
+ uint32_t eax, edx;
+ uint64_t result, *guest_msrs;
+
+ if (lapic_msr(num)) {
+ error = lapic_rdmsr(vm, cpu, num, &result);
+ goto done;
+ }
+
+ idx = msr_num_to_idx(num);
+ if (idx < 0 || invalid_msr(idx)) {
+ error = EINVAL;
+ goto done;
+ }
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+ result = guest_msrs[idx];
+
+ /*
+	 * If this is not an emulated MSR, make sure that the processor
+	 * state matches our cached state.
+ */
+ if (!emulated_msr(idx) && (rdmsr(num) != result)) {
+ panic("emulate_rdmsr: msr 0x%0x has inconsistent cached "
+ "(0x%016lx) and actual (0x%016lx) values", num,
+ result, rdmsr(num));
+ }
+
+ error = 0;
+
+done:
+ if (error == 0) {
+ eax = result;
+ edx = result >> 32;
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RAX, eax);
+ if (error)
+ panic("vm_set_register(rax) error %d", error);
+ error = vm_set_register(vm, cpu, VM_REG_GUEST_RDX, edx);
+ if (error)
+ panic("vm_set_register(rdx) error %d", error);
+ }
+ return (error);
+}
+
+void
+restore_guest_msrs(struct vm *vm, int cpu)
+{
+ int i;
+ uint64_t *guest_msrs;
+
+ guest_msrs = vm_guest_msrs(vm, cpu);
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, guest_msrs[i]);
+ }
+}
+
+void
+restore_host_msrs(struct vm *vm, int cpu)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (emulated_msr(i))
+ continue;
+ else
+ wrmsr(vmm_msr[i].num, vmm_msr[i].hostval);
+ }
+}
+
+/*
+ * Must be called by the CPU-specific code before any guests are
+ * created
+ */
+void
+guest_msr_valid(int msr)
+{
+ int i;
+
+ for (i = 0; i < vmm_msr_num; i++) {
+ if (vmm_msr[i].num == msr && invalid_msr(i)) {
+ vmm_msr[i].flags &= ~VMM_MSR_F_INVALID;
+ }
+ }
+}
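+
+/*
+ * For instance, CPU-specific code that knows how to context-switch the
+ * PAT msr would call guest_msr_valid(MSR_PAT) from its init routine,
+ * clearing VMM_MSR_F_INVALID so that guest accesses to it succeed.
+ */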
diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h
new file mode 100644
index 0000000..8a1fda3
--- /dev/null
+++ b/sys/amd64/vmm/vmm_msr.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MSR_H_
+#define _VMM_MSR_H_
+
+#define VMM_MSR_NUM 16
+struct vm;
+
+void vmm_msr_init(void);
+int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val);
+int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr);
+void guest_msrs_init(struct vm *vm, int cpu);
+void guest_msr_valid(int msr);
+void restore_host_msrs(struct vm *vm, int cpu);
+void restore_guest_msrs(struct vm *vm, int cpu);
+
+#endif
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
new file mode 100644
index 0000000..ae60979
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#include "vmm_stat.h"
+
+static int vstnum;
+static struct vmm_stat_type *vsttab[MAX_VMM_STAT_TYPES];
+
+static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+
+void
+vmm_stat_init(void *arg)
+{
+ struct vmm_stat_type *vst = arg;
+
+ /* We require all stats to identify themselves with a description */
+ if (vst->desc == NULL)
+ return;
+
+ if (vstnum >= MAX_VMM_STAT_TYPES) {
+		printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc);
+ return;
+ }
+
+ vst->index = vstnum;
+ vsttab[vstnum++] = vst;
+}
+
+int
+vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
+{
+ int i;
+ uint64_t *stats;
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ stats = vcpu_stats(vm, vcpu);
+ for (i = 0; i < vstnum; i++)
+ buf[i] = stats[i];
+ *num_stats = vstnum;
+ return (0);
+}
+
+void *
+vmm_stat_alloc(void)
+{
+ u_long size;
+
+ size = vstnum * sizeof(uint64_t);
+
+ return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+}
+
+void
+vmm_stat_free(void *vp)
+{
+ free(vp, M_VMM_STAT);
+}
+
+const char *
+vmm_stat_desc(int index)
+{
+
+ if (index >= 0 && index < vstnum)
+ return (vsttab[index]->desc);
+ else
+ return (NULL);
+}
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
new file mode 100644
index 0000000..7c075a6
--- /dev/null
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_TYPES 64 /* arbitrary */
+
+struct vmm_stat_type {
+ const char *desc; /* description of statistic */
+ int index; /* position in the stats buffer */
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, desc) \
+ struct vmm_stat_type type[1] = { \
+ { desc, -1 } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
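+
+/*
+ * Illustrative definition and use (the stat name is hypothetical):
+ *
+ *	VMM_STAT_DEFINE(VCPU_MIGRATIONS, "vcpu migration across host cpus");
+ *	...
+ *	vmm_stat_incr(vm, vcpuid, VCPU_MIGRATIONS, 1);
+ */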
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' should be large enough to hold at least 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+const char *vmm_stat_desc(int index);
+
+static void __inline
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats = vcpu_stats(vm, vcpu);
+ if (vst->index >= 0)
+ stats[vst->index] += x;
+#endif
+}
+
+#endif
diff --git a/sys/amd64/vmm/vmm_support.S b/sys/amd64/vmm/vmm_support.S
new file mode 100644
index 0000000..2afc608
--- /dev/null
+++ b/sys/amd64/vmm/vmm_support.S
@@ -0,0 +1,42 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#define LOCORE
+
+#include <machine/asmacros.h>
+
+#define LA_EOI 0xB0
+
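+/*
+ * Minimal interrupt handler installed at the vmm IPI vector: it acks
+ * the local APIC by storing 0 to its EOI register (offset LA_EOI) and
+ * returns, preserving all registers. 'lapic' is the kernel's mapping
+ * of the local APIC page.
+ */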
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(justreturn)
+ pushq %rax
+ movq lapic, %rax
+ movl $0, LA_EOI(%rax)
+ popq %rax
+ iretq
diff --git a/sys/amd64/vmm/vmm_util.c b/sys/amd64/vmm/vmm_util.c
new file mode 100644
index 0000000..f245f92
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
diff --git a/sys/amd64/vmm/vmm_util.h b/sys/amd64/vmm/vmm_util.h
new file mode 100644
index 0000000..7f82332
--- /dev/null
+++ b/sys/amd64/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
new file mode 100644
index 0000000..94abe09
--- /dev/null
+++ b/sys/amd64/vmm/x86.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+
+#include "x86.h"
+
+#define CPUID_VM_HIGH 0x40000000
+
+static const char bhyve_id[12] = "BHyVE BHyVE ";
+
+int
+x86_emulate_cpuid(struct vm *vm, int vcpu_id,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int error;
+ unsigned int func, regs[4];
+ enum x2apic_state x2apic_state;
+
+ func = *eax;
+
+ /*
+ * Requests for invalid CPUID levels should map to the highest
+ * available level instead.
+ */
+ if (cpu_exthigh != 0 && *eax >= 0x80000000) {
+ if (*eax > cpu_exthigh)
+ *eax = cpu_exthigh;
+ } else if (*eax >= 0x40000000) {
+ if (*eax > CPUID_VM_HIGH)
+ *eax = CPUID_VM_HIGH;
+ } else if (*eax > cpu_high) {
+ *eax = cpu_high;
+ }
+
+ /*
+	 * In general the approach used for CPU topology is to advertise
+	 * a flat topology in which each vCPU is its own package, with no
+	 * multi-core or SMT.
+ */
+ switch (func) {
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_0000_000A:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0000:
+ case CPUID_8000_0001:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0007:
+ case CPUID_8000_0008:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_0000_0001:
+ do_cpuid(1, regs);
+
+ error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
+ if (error) {
+ panic("x86_emulate_cpuid: error %d "
+ "fetching x2apic state", error);
+ }
+
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_LOCAL_APIC_ID);
+ regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+		 * Don't expose VMX, SpeedStep (EST) or Thermal Monitor 2
+		 * (TM2) capability.
+ * Advertise x2APIC capability and Hypervisor guest.
+ */
+ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+
+ regs[2] |= CPUID2_HV;
+
+ if (x2apic_state != X2APIC_DISABLED)
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Hide xsave/osxsave/avx until the FPU save/restore
+ * issues are resolved
+ */
+ regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
+ CPUID2_AVX);
+
+ /*
+ * Hide monitor/mwait until we know how to deal with
+ * these instructions.
+ */
+ regs[2] &= ~CPUID2_MON;
+
+ /*
+ * Hide thermal monitoring
+ */
+ regs[3] &= ~(CPUID_ACPI | CPUID_TM);
+
+ /*
+ * Machine check handling is done in the host.
+ * Hide MTRR capability.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);
+
+ /*
+ * Disable multi-core.
+ */
+ regs[1] &= ~CPUID_HTT_CORES;
+ regs[3] &= ~CPUID_HTT;
+ break;
+
+ case CPUID_0000_0004:
+ do_cpuid(4, regs);
+
+ /*
+ * Do not expose topology.
+ */
+ regs[0] &= 0xffff8000;
+ regs[0] |= 0x04008000;
+ break;
+
+ case CPUID_0000_0006:
+ case CPUID_0000_0007:
+ /*
+ * Handle the access, but report 0 for
+ * all options
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = vcpu_id;
+ break;
+
+ case 0x40000000:
+ regs[0] = CPUID_VM_HIGH;
+ bcopy(bhyve_id, &regs[1], 4);
+		bcopy(bhyve_id + 4, &regs[2], 4);
+		bcopy(bhyve_id + 8, &regs[3], 4);
+ break;
+ default:
+ /* XXX: Leaf 5? */
+ return (0);
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+ return (1);
+}
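+
+/*
+ * Example: a guest cpuid with %eax = 0x40000000 returns CPUID_VM_HIGH
+ * in %eax and the 12-byte signature "BHyVE BHyVE " split across
+ * %ebx/%ecx/%edx ("BHyV", "E BH", "yVE "), the conventional way for a
+ * hypervisor to identify itself.
+ */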
diff --git a/sys/amd64/vmm/x86.h b/sys/amd64/vmm/x86.h
new file mode 100644
index 0000000..368e967
--- /dev/null
+++ b/sys/amd64/vmm/x86.h
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_0006 (0x6)
+#define CPUID_0000_0007 (0x7)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx);
+
+#endif