Diffstat (limited to 'sys/amd64/vmm/vmm.c')
-rw-r--r--  sys/amd64/vmm/vmm.c  664
1 file changed, 490 insertions, 174 deletions
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 5fc6b94..5c2f202 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
#include <machine/vmm.h>
+#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
+struct mem_seg {
+ vm_paddr_t gpa;
+ size_t len;
+ boolean_t wired;
+ vm_object_t object;
+};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
+ struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
- struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
-#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
-#define VMRUN(vmi, vcpu, rip) \
- (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
+#define VMRUN(vmi, vcpu, rip, pmap) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
-#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
- (ops != NULL ? \
- (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
- ENXIO)
-#define VMMMAP_GET(vmi, gpa) \
- (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMSPACE_ALLOC(min, max) \
+ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
+#define VMSPACE_FREE(vmspace) \
+ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
- if (ppt_num_devices() > 0)
- iommu_init();
+ iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
- vm_paddr_t maxaddr;
+ struct vmspace *vmspace;
const int BSP = 0;
@@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
+ vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
+ if (vmspace == NULL)
+ return (ENOMEM);
+
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
- vm->cookie = VMINIT(vm);
+ vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
- maxaddr = vmm_mem_maxaddr();
- vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
+ vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
-vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
- size_t len;
- vm_paddr_t hpa;
- void *host_domain;
-
- host_domain = iommu_host_domain();
-
- len = 0;
- while (len < seg->len) {
- hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
- if (hpa == (vm_paddr_t)-1) {
- panic("vm_free_mem_segs: cannot free hpa "
- "associated with gpa 0x%016lx", seg->gpa + len);
- }
-
- /*
- * Remove the 'gpa' to 'hpa' mapping in VMs domain.
- * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
- */
- iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
- iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
-
- vmm_mem_free(hpa, PAGE_SIZE);
-
- len += PAGE_SIZE;
- }
- /*
- * Invalidate cached translations associated with 'vm->iommu' since
- * we have now moved some pages from it.
- */
- iommu_invalidate_tlb(vm->iommu);
+ if (seg->object != NULL)
+ vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
- bzero(seg, sizeof(struct vm_memory_segment));
+ bzero(seg, sizeof(*seg));
}
void
@@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
+ if (vm->iommu != NULL)
+ iommu_destroy_domain(vm->iommu);
+
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
- iommu_destroy_domain(vm->iommu);
+ VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
- const boolean_t spok = TRUE; /* superpage mappings are ok */
+ vm_object_t obj;
- return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
- VM_PROT_RW, spok));
+ if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
+ return (ENOMEM);
+ else
+ return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- const boolean_t spok = TRUE; /* superpage mappings are ok */
- return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
- VM_PROT_NONE, spok));
+ vmm_mmio_free(vm->vmspace, gpa, len);
+ return (0);
}
-/*
- * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
- */
-static boolean_t
-vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+boolean_t
+vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
- if (gpa & PAGE_MASK)
- panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
-
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
- return (FALSE);
+ return (TRUE); /* 'gpa' is regular memory */
}
- return (TRUE);
+ if (ppt_is_mmio(vm, gpa))
+ return (TRUE); /* 'gpa' is pci passthru mmio */
+
+ return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- int error, available, allocated;
- struct vm_memory_segment *seg;
- vm_paddr_t g, hpa;
- void *host_domain;
-
- const boolean_t spok = TRUE; /* superpage mappings are ok */
+ int available, allocated;
+ struct mem_seg *seg;
+ vm_object_t object;
+ vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
- if (vm_gpa_available(vm, g))
- available++;
- else
+ if (vm_mem_allocated(vm, g))
allocated++;
+ else
+ available++;
g += PAGE_SIZE;
}
@@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
- host_domain = iommu_host_domain();
-
seg = &vm->mem_segs[vm->num_mem_segs];
- error = 0;
+ if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
+ return (ENOMEM);
+
seg->gpa = gpa;
- seg->len = 0;
- while (seg->len < len) {
- hpa = vmm_mem_alloc(PAGE_SIZE);
- if (hpa == 0) {
- error = ENOMEM;
- break;
- }
+ seg->len = len;
+ seg->object = object;
+ seg->wired = FALSE;
- error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
- VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
- if (error)
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+static void
+vm_gpa_unwire(struct vm *vm)
+{
+ int i, rv;
+ struct mem_seg *seg;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ if (!seg->wired)
+ continue;
+
+ rv = vm_map_unwire(&vm->vmspace->vm_map,
+ seg->gpa, seg->gpa + seg->len,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
+ "%#lx/%ld could not be unwired: %d",
+ vm_name(vm), seg->gpa, seg->len, rv));
+
+ seg->wired = FALSE;
+ }
+}
+
+static int
+vm_gpa_wire(struct vm *vm)
+{
+ int i, rv;
+ struct mem_seg *seg;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ if (seg->wired)
+ continue;
+
+ /* XXX rlimits? */
+ rv = vm_map_wire(&vm->vmspace->vm_map,
+ seg->gpa, seg->gpa + seg->len,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ if (rv != KERN_SUCCESS)
break;
+ seg->wired = TRUE;
+ }
+
+ if (i < vm->num_mem_segs) {
/*
- * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
- * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ * Undo the wiring before returning an error.
*/
- iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
- iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+ vm_gpa_unwire(vm);
+ return (EAGAIN);
+ }
+
+ return (0);
+}
+
+static void
+vm_iommu_modify(struct vm *vm, boolean_t map)
+{
+ int i, sz;
+ vm_paddr_t gpa, hpa;
+ struct mem_seg *seg;
+ void *vp, *cookie, *host_domain;
- seg->len += PAGE_SIZE;
+ sz = PAGE_SIZE;
+ host_domain = iommu_host_domain();
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
+ vm_name(vm), seg->gpa, seg->len));
+
+ gpa = seg->gpa;
+ while (gpa < seg->gpa + seg->len) {
+ vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
+ &cookie);
+ KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
+ vm_name(vm), gpa));
+
+ vm_gpa_release(cookie);
+
+ hpa = DMAP_TO_PHYS((uintptr_t)vp);
+ if (map) {
+ iommu_create_mapping(vm->iommu, gpa, hpa, sz);
+ iommu_remove_mapping(host_domain, hpa, sz);
+ } else {
+ iommu_remove_mapping(vm->iommu, gpa, sz);
+ iommu_create_mapping(host_domain, hpa, hpa, sz);
+ }
+
+ gpa += PAGE_SIZE;
+ }
}
- if (error) {
- vm_free_mem_seg(vm, seg);
+ /*
+ * Invalidate the cached translations associated with the domain
+ * from which pages were removed.
+ */
+ if (map)
+ iommu_invalidate_tlb(host_domain);
+ else
+ iommu_invalidate_tlb(vm->iommu);
+}
+
+#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
+#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
+
+int
+vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
+{
+ int error;
+
+ error = ppt_unassign_device(vm, bus, slot, func);
+ if (error)
return (error);
+
+ if (ppt_num_devices(vm) == 0) {
+ vm_iommu_unmap(vm);
+ vm_gpa_unwire(vm);
}
+ return (0);
+}
+
+int
+vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
+{
+ int error;
+ vm_paddr_t maxaddr;
/*
- * Invalidate cached translations associated with 'host_domain' since
- * we have now moved some pages from it.
+ * Virtual machines with pci passthru devices get special treatment:
+ * - the guest physical memory is wired
+ * - the iommu is programmed to do the 'gpa' to 'hpa' translation
+ *
+ * We need to do this before the first pci passthru device is attached.
*/
- iommu_invalidate_tlb(host_domain);
+ if (ppt_num_devices(vm) == 0) {
+ KASSERT(vm->iommu == NULL,
+ ("vm_assign_pptdev: iommu must be NULL"));
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
- vm->num_mem_segs++;
+ error = vm_gpa_wire(vm);
+ if (error)
+ return (error);
- return (0);
+ vm_iommu_map(vm);
+ }
+
+ error = ppt_assign_device(vm, bus, slot, func);
+ return (error);
}
-vm_paddr_t
-vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+void *
+vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
+ void **cookie)
{
- vm_paddr_t nextpage;
+ int count, pageoff;
+ vm_page_t m;
+
+ pageoff = gpa & PAGE_MASK;
+ if (len > PAGE_SIZE - pageoff)
+ panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
- nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
- if (len > nextpage - gpa)
- panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+ count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
+ trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
- return (VMMMAP_GET(vm->cookie, gpa));
+ if (count == 1) {
+ *cookie = m;
+ return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
+ } else {
+ *cookie = NULL;
+ return (NULL);
+ }
+}
+
+void
+vm_gpa_release(void *cookie)
+{
+ vm_page_t m = cookie;
+
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
}
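
A minimal sketch of how the new hold/release pair might be used by a caller (not part of this commit; the helper name copy_from_guest_u32 is hypothetical). The request is assumed to stay within a single page, matching the panic check in vm_gpa_hold() above:

static int
copy_from_guest_u32(struct vm *vm, vm_paddr_t gpa, uint32_t *val)
{
	void *cookie, *vp;

	/* Hold the backing page and get a direct-map pointer into it. */
	vp = vm_gpa_hold(vm, gpa, sizeof(*val), VM_PROT_READ, &cookie);
	if (vp == NULL)
		return (EFAULT);	/* 'gpa' is not backed by any segment */

	*val = *(uint32_t *)vp;
	vm_gpa_release(cookie);		/* drop the page hold */
	return (0);
}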
int
@@ -508,7 +639,9 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
- *seg = vm->mem_segs[i];
+ seg->gpa = vm->mem_segs[i].gpa;
+ seg->len = vm->mem_segs[i].len;
+ seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
@@ -516,6 +649,33 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
}
int
+vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
+ vm_offset_t *offset, struct vm_object **object)
+{
+ int i;
+ size_t seg_len;
+ vm_paddr_t seg_gpa;
+ vm_object_t seg_obj;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if ((seg_obj = vm->mem_segs[i].object) == NULL)
+ continue;
+
+ seg_gpa = vm->mem_segs[i].gpa;
+ seg_len = vm->mem_segs[i].len;
+
+ if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
+ *offset = gpa - seg_gpa;
+ *object = seg_obj;
+ vm_object_reference(seg_obj);
+ return (0);
+ }
+ }
+
+ return (EINVAL);
+}
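
As a hedged illustration of the contract above (the caller name example_lookup_memobj is hypothetical and not part of the diff), a consumer of vm_get_memobj() owns the reference it returns and is expected to drop it with vm_object_deallocate() when done:

static int
example_lookup_memobj(struct vm *vm, vm_paddr_t gpa)
{
	int error;
	vm_offset_t offset;
	struct vm_object *obj;

	error = vm_get_memobj(vm, gpa, PAGE_SIZE, &offset, &obj);
	if (error)
		return (error);		/* EINVAL: gpa not in any segment */

	/* ... inspect or map 'obj' at 'offset' here ... */

	vm_object_deallocate(obj);	/* drop the reference taken above */
	return (0);
}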
+
+int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+static int
+vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ vcpu_assert_locked(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> FROZEN -> IDLE
+ * FROZEN -> RUNNING -> FROZEN
+ * FROZEN -> SLEEPING -> FROZEN
+ */
+ switch (vcpu->state) {
+ case VCPU_IDLE:
+ case VCPU_RUNNING:
+ case VCPU_SLEEPING:
+ error = (newstate != VCPU_FROZEN);
+ break;
+ case VCPU_FROZEN:
+ error = (newstate == VCPU_FROZEN);
+ break;
+ default:
+ error = 1;
+ break;
+ }
+
+ if (error == 0)
+ vcpu->state = newstate;
+ else
+ error = EBUSY;
+
+ return (error);
+}
+
+static void
+vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
+ panic("Error %d setting state to %d\n", error, newstate);
+}
+
+static void
+vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
+ panic("Error %d setting state to %d", error, newstate);
+}
+
+/*
+ * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
+ */
+static int
+vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ struct vcpu *vcpu;
+ int sleepticks, t;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+ vcpu_unlock(vcpu);
+
+ return (0);
+}
+
+static int
+vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ int rv, ftype;
+ struct vm_map *map;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ ftype = vme->u.paging.fault_type;
+ KASSERT(ftype == VM_PROT_READ ||
+ ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
+ ("vm_handle_paging: invalid fault_type %d", ftype));
+
+ if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
+ rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
+ vme->u.paging.gpa, ftype);
+ if (rv == 0)
+ goto done;
+ }
+
+ map = &vm->vmspace->vm_map;
+ rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
+
+ VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
+ rv, vme->u.paging.gpa, ftype);
+
+ if (rv != KERN_SUCCESS)
+ return (EFAULT);
+done:
+ /* restart execution at the faulting instruction */
+ vme->inst_length = 0;
+
+ return (0);
+}
+
+static int
+vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ struct vie *vie;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+ int error, inst_length;
+ uint64_t rip, gla, gpa, cr3;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ rip = vme->rip;
+ inst_length = vme->inst_length;
+
+ gla = vme->u.inst_emul.gla;
+ gpa = vme->u.inst_emul.gpa;
+ cr3 = vme->u.inst_emul.cr3;
+ vie = &vme->u.inst_emul.vie;
+
+ vie_init(vie);
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
+ return (EFAULT);
+
+ if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
+ return (EFAULT);
+
+ /* return to userland unless this is a local apic access */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
+ *retu = TRUE;
+ return (0);
+ }
+
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ /* return to userland to spin up the AP */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
+ *retu = TRUE;
+
+ return (error);
+}
+
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
- int error, vcpuid, sleepticks, t;
+ int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
+ boolean_t retu;
+ pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
+ pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
- vme = &vmrun->vm_exit;
+ vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
+ KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+ ("vm_run: absurd pm_active"));
+
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@@ -661,62 +1010,44 @@ restart:
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
+ vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
- error = VMRUN(vm->cookie, vcpuid, rip);
+ error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
+ vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
- /* copy the exit information */
- bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
-
critical_exit();
- /*
- * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
- * is ready to run.
- */
- if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
- vcpu_lock(vcpu);
-
- /*
- * Figure out the number of host ticks until the next apic
- * timer interrupt in the guest.
- */
- sleepticks = lapic_timer_tick(vm, vcpuid);
-
- /*
- * If the guest local apic timer is disabled then sleep for
- * a long time but not forever.
- */
- if (sleepticks < 0)
- sleepticks = hz;
-
- /*
- * Do a final check for pending NMI or interrupts before
- * really putting this thread to sleep.
- *
- * These interrupts could have happened any time after we
- * returned from VMRUN() and before we grabbed the vcpu lock.
- */
- if (!vm_nmi_pending(vm, vcpuid) &&
- lapic_pending_intr(vm, vcpuid) < 0) {
- if (sleepticks <= 0)
- panic("invalid sleepticks %d", sleepticks);
- t = ticks;
- msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
- vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ if (error == 0) {
+ retu = FALSE;
+ switch (vme->exitcode) {
+ case VM_EXITCODE_HLT:
+ error = vm_handle_hlt(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_PAGING:
+ error = vm_handle_paging(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_INST_EMUL:
+ error = vm_handle_inst_emul(vm, vcpuid, &retu);
+ break;
+ default:
+ retu = TRUE; /* handled in userland */
+ break;
}
+ }
- vcpu_unlock(vcpu);
-
+ if (error == 0 && retu == FALSE) {
rip = vme->rip + vme->inst_length;
goto restart;
}
+ /* copy the exit information */
+ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
@@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
-vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
-
- /*
- * The following state transitions are allowed:
- * IDLE -> RUNNING -> IDLE
- * IDLE -> CANNOT_RUN -> IDLE
- */
- if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
- (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
- error = 0;
- vcpu->state = state;
- } else {
- error = EBUSY;
- }
-
+ error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
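
To illustrate the transition table now enforced by vcpu_set_state_locked() (a sketch only; example_modify_vcpu is a hypothetical name), an ioctl-style caller would freeze a vcpu before touching its state and return it to IDLE afterwards:

static int
example_modify_vcpu(struct vm *vm, int vcpuid)
{
	int error;

	/* IDLE -> FROZEN; EBUSY if another thread already froze the vcpu. */
	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
	if (error)
		return (error);

	/* ... it is now safe to read or modify the vcpu's state ... */

	/* FROZEN -> IDLE */
	vcpu_set_state(vm, vcpuid, VCPU_IDLE);
	return (0);
}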
@@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
- /*
- * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
- * the host thread must be sleeping waiting for an event to
- * kick the vcpu out of 'hlt'.
- *
- * XXX this is racy because the condition exists right before
- * and after calling VMRUN() in vm_run(). The wakeup() is
- * benign in this case.
- */
- if (vcpu->state == VCPU_RUNNING)
+ if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
+
+struct vmspace *
+vm_get_vmspace(struct vm *vm)
+{
+
+ return (vm->vmspace);
+}