Diffstat (limited to 'sys/amd64/vmm/vmm.c')
-rw-r--r--  sys/amd64/vmm/vmm.c  664
1 file changed, 490 insertions, 174 deletions
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 5fc6b94..5c2f202 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -39,18 +39,28 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
#include <machine/vmm.h>
+#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
@@ -84,15 +94,23 @@ struct vcpu {
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
+struct mem_seg {
+ vm_paddr_t gpa;
+ size_t len;
+ boolean_t wired;
+ vm_object_t object;
+};
#define VM_MAX_MEMORY_SEGMENTS 2
struct vm {
void *cookie; /* processor-specific data */
void *iommu; /* iommu-specific data */
+ struct vmspace *vmspace; /* guest's address space */
struct vcpu vcpu[VM_MAXCPU];
int num_mem_segs;
- struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
char name[VM_MAX_NAMELEN];
/*
@@ -109,16 +127,14 @@ static struct vmm_ops *ops;
#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
-#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
-#define VMRUN(vmi, vcpu, rip) \
- (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
+#define VMRUN(vmi, vcpu, rip, pmap) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
-#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
- (ops != NULL ? \
- (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
- ENXIO)
-#define VMMMAP_GET(vmi, gpa) \
- (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMSPACE_ALLOC(min, max) \
+ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
+#define VMSPACE_FREE(vmspace) \
+ (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
@@ -213,8 +229,7 @@ vmm_handler(module_t mod, int what, void *arg)
switch (what) {
case MOD_LOAD:
vmmdev_init();
- if (ppt_num_devices() > 0)
- iommu_init();
+ iommu_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
@@ -265,7 +280,7 @@ vm_create(const char *name, struct vm **retvm)
{
int i;
struct vm *vm;
- vm_paddr_t maxaddr;
+ struct vmspace *vmspace;
const int BSP = 0;
@@ -279,59 +294,34 @@ vm_create(const char *name, struct vm **retvm)
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
+ vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
+ if (vmspace == NULL)
+ return (ENOMEM);
+
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
- vm->cookie = VMINIT(vm);
+ vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
for (i = 0; i < VM_MAXCPU; i++) {
vcpu_init(vm, i);
guest_msrs_init(vm, i);
}
- maxaddr = vmm_mem_maxaddr();
- vm->iommu = iommu_create_domain(maxaddr);
vm_activate_cpu(vm, BSP);
+ vm->vmspace = vmspace;
*retvm = vm;
return (0);
}
static void
-vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{
- size_t len;
- vm_paddr_t hpa;
- void *host_domain;
-
- host_domain = iommu_host_domain();
-
- len = 0;
- while (len < seg->len) {
- hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
- if (hpa == (vm_paddr_t)-1) {
- panic("vm_free_mem_segs: cannot free hpa "
- "associated with gpa 0x%016lx", seg->gpa + len);
- }
-
- /*
- * Remove the 'gpa' to 'hpa' mapping in VMs domain.
- * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
- */
- iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
- iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
-
- vmm_mem_free(hpa, PAGE_SIZE);
-
- len += PAGE_SIZE;
- }
- /*
- * Invalidate cached translations associated with 'vm->iommu' since
- * we have now moved some pages from it.
- */
- iommu_invalidate_tlb(vm->iommu);
+ if (seg->object != NULL)
+ vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
- bzero(seg, sizeof(struct vm_memory_segment));
+ bzero(seg, sizeof(*seg));
}
void
@@ -341,6 +331,9 @@ vm_destroy(struct vm *vm)
ppt_unassign_all(vm);
+ if (vm->iommu != NULL)
+ iommu_destroy_domain(vm->iommu);
+
for (i = 0; i < vm->num_mem_segs; i++)
vm_free_mem_seg(vm, &vm->mem_segs[i]);
@@ -349,7 +342,7 @@ vm_destroy(struct vm *vm)
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(&vm->vcpu[i]);
- iommu_destroy_domain(vm->iommu);
+ VMSPACE_FREE(vm->vmspace);
VMCLEANUP(vm->cookie);
@@ -365,52 +358,48 @@ vm_name(struct vm *vm)
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
- const boolean_t spok = TRUE; /* superpage mappings are ok */
+ vm_object_t obj;
- return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
- VM_PROT_RW, spok));
+ if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
+ return (ENOMEM);
+ else
+ return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- const boolean_t spok = TRUE; /* superpage mappings are ok */
- return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
- VM_PROT_NONE, spok));
+ vmm_mmio_free(vm->vmspace, gpa, len);
+ return (0);
}
-/*
- * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
- */
-static boolean_t
-vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+boolean_t
+vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
int i;
vm_paddr_t gpabase, gpalimit;
- if (gpa & PAGE_MASK)
- panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
-
for (i = 0; i < vm->num_mem_segs; i++) {
gpabase = vm->mem_segs[i].gpa;
gpalimit = gpabase + vm->mem_segs[i].len;
if (gpa >= gpabase && gpa < gpalimit)
- return (FALSE);
+ return (TRUE); /* 'gpa' is regular memory */
}
- return (TRUE);
+ if (ppt_is_mmio(vm, gpa))
+ return (TRUE); /* 'gpa' is pci passthru mmio */
+
+ return (FALSE);
}
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
- int error, available, allocated;
- struct vm_memory_segment *seg;
- vm_paddr_t g, hpa;
- void *host_domain;
-
- const boolean_t spok = TRUE; /* superpage mappings are ok */
+ int available, allocated;
+ struct mem_seg *seg;
+ vm_object_t object;
+ vm_paddr_t g;
if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
return (EINVAL);
@@ -418,10 +407,10 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
available = allocated = 0;
g = gpa;
while (g < gpa + len) {
- if (vm_gpa_available(vm, g))
- available++;
- else
+ if (vm_mem_allocated(vm, g))
allocated++;
+ else
+ available++;
g += PAGE_SIZE;
}
@@ -443,61 +432,203 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
return (E2BIG);
- host_domain = iommu_host_domain();
-
seg = &vm->mem_segs[vm->num_mem_segs];
- error = 0;
+ if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
+ return (ENOMEM);
+
seg->gpa = gpa;
- seg->len = 0;
- while (seg->len < len) {
- hpa = vmm_mem_alloc(PAGE_SIZE);
- if (hpa == 0) {
- error = ENOMEM;
- break;
- }
+ seg->len = len;
+ seg->object = object;
+ seg->wired = FALSE;
- error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
- VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
- if (error)
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+static void
+vm_gpa_unwire(struct vm *vm)
+{
+ int i, rv;
+ struct mem_seg *seg;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ if (!seg->wired)
+ continue;
+
+ rv = vm_map_unwire(&vm->vmspace->vm_map,
+ seg->gpa, seg->gpa + seg->len,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
+ "%#lx/%ld could not be unwired: %d",
+ vm_name(vm), seg->gpa, seg->len, rv));
+
+ seg->wired = FALSE;
+ }
+}
+
+static int
+vm_gpa_wire(struct vm *vm)
+{
+ int i, rv;
+ struct mem_seg *seg;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ if (seg->wired)
+ continue;
+
+ /* XXX rlimits? */
+ rv = vm_map_wire(&vm->vmspace->vm_map,
+ seg->gpa, seg->gpa + seg->len,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ if (rv != KERN_SUCCESS)
break;
+ seg->wired = TRUE;
+ }
+
+ if (i < vm->num_mem_segs) {
/*
- * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
- * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ * Undo the wiring before returning an error.
*/
- iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
- iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+ vm_gpa_unwire(vm);
+ return (EAGAIN);
+ }
+
+ return (0);
+}
+
+static void
+vm_iommu_modify(struct vm *vm, boolean_t map)
+{
+ int i, sz;
+ vm_paddr_t gpa, hpa;
+ struct mem_seg *seg;
+ void *vp, *cookie, *host_domain;
- seg->len += PAGE_SIZE;
+ sz = PAGE_SIZE;
+ host_domain = iommu_host_domain();
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ seg = &vm->mem_segs[i];
+ KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
+ vm_name(vm), seg->gpa, seg->len));
+
+ gpa = seg->gpa;
+ while (gpa < seg->gpa + seg->len) {
+ vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
+ &cookie);
+ KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
+ vm_name(vm), gpa));
+
+ vm_gpa_release(cookie);
+
+ hpa = DMAP_TO_PHYS((uintptr_t)vp);
+ if (map) {
+ iommu_create_mapping(vm->iommu, gpa, hpa, sz);
+ iommu_remove_mapping(host_domain, hpa, sz);
+ } else {
+ iommu_remove_mapping(vm->iommu, gpa, sz);
+ iommu_create_mapping(host_domain, hpa, hpa, sz);
+ }
+
+ gpa += PAGE_SIZE;
+ }
}
- if (error) {
- vm_free_mem_seg(vm, seg);
+ /*
+ * Invalidate the cached translations associated with the domain
+ * from which pages were removed.
+ */
+ if (map)
+ iommu_invalidate_tlb(host_domain);
+ else
+ iommu_invalidate_tlb(vm->iommu);
+}
+
+#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
+#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
+
+int
+vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
+{
+ int error;
+
+ error = ppt_unassign_device(vm, bus, slot, func);
+ if (error)
return (error);
+
+ if (ppt_num_devices(vm) == 0) {
+ vm_iommu_unmap(vm);
+ vm_gpa_unwire(vm);
}
+ return (0);
+}
+
+int
+vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
+{
+ int error;
+ vm_paddr_t maxaddr;
/*
- * Invalidate cached translations associated with 'host_domain' since
- * we have now moved some pages from it.
+ * Virtual machines with pci passthru devices get special treatment:
+ * - the guest physical memory is wired
+ * - the iommu is programmed to do the 'gpa' to 'hpa' translation
+ *
+ * We need to do this before the first pci passthru device is attached.
*/
- iommu_invalidate_tlb(host_domain);
+ if (ppt_num_devices(vm) == 0) {
+ KASSERT(vm->iommu == NULL,
+ ("vm_assign_pptdev: iommu must be NULL"));
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
- vm->num_mem_segs++;
+ error = vm_gpa_wire(vm);
+ if (error)
+ return (error);
- return (0);
+ vm_iommu_map(vm);
+ }
+
+ error = ppt_assign_device(vm, bus, slot, func);
+ return (error);
}
-vm_paddr_t
-vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+void *
+vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
+ void **cookie)
{
- vm_paddr_t nextpage;
+ int count, pageoff;
+ vm_page_t m;
+
+ pageoff = gpa & PAGE_MASK;
+ if (len > PAGE_SIZE - pageoff)
+ panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
- nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
- if (len > nextpage - gpa)
- panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+ count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
+ trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
- return (VMMMAP_GET(vm->cookie, gpa));
+ if (count == 1) {
+ *cookie = m;
+ return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
+ } else {
+ *cookie = NULL;
+ return (NULL);
+ }
+}
+
+void
+vm_gpa_release(void *cookie)
+{
+ vm_page_t m = cookie;
+
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
}
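
A minimal sketch of how the new hold/release pair might be used by a caller (not part of this commit; the helper name copy_from_guest_u32 is hypothetical). The request is assumed to stay within a single page, matching the panic check in vm_gpa_hold() above:

static int
copy_from_guest_u32(struct vm *vm, vm_paddr_t gpa, uint32_t *val)
{
	void *cookie, *vp;

	/* Hold the backing page and get a direct-map pointer into it. */
	vp = vm_gpa_hold(vm, gpa, sizeof(*val), VM_PROT_READ, &cookie);
	if (vp == NULL)
		return (EFAULT);	/* 'gpa' is not backed by any segment */

	*val = *(uint32_t *)vp;
	vm_gpa_release(cookie);		/* drop the page hold */
	return (0);
}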
int
@@ -508,7 +639,9 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
for (i = 0; i < vm->num_mem_segs; i++) {
if (gpabase == vm->mem_segs[i].gpa) {
- *seg = vm->mem_segs[i];
+ seg->gpa = vm->mem_segs[i].gpa;
+ seg->len = vm->mem_segs[i].len;
+ seg->wired = vm->mem_segs[i].wired;
return (0);
}
}
@@ -516,6 +649,33 @@ vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
}
int
+vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
+ vm_offset_t *offset, struct vm_object **object)
+{
+ int i;
+ size_t seg_len;
+ vm_paddr_t seg_gpa;
+ vm_object_t seg_obj;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if ((seg_obj = vm->mem_segs[i].object) == NULL)
+ continue;
+
+ seg_gpa = vm->mem_segs[i].gpa;
+ seg_len = vm->mem_segs[i].len;
+
+ if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
+ *offset = gpa - seg_gpa;
+ *object = seg_obj;
+ vm_object_reference(seg_obj);
+ return (0);
+ }
+ }
+
+ return (EINVAL);
+}
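
As a hedged illustration of the contract above (the caller name example_lookup_memobj is hypothetical and not part of the diff), a consumer of vm_get_memobj() owns the reference it returns and is expected to drop it with vm_object_deallocate() when done:

static int
example_lookup_memobj(struct vm *vm, vm_paddr_t gpa)
{
	int error;
	vm_offset_t offset;
	struct vm_object *obj;

	error = vm_get_memobj(vm, gpa, PAGE_SIZE, &offset, &obj);
	if (error)
		return (error);		/* EINVAL: gpa not in any segment */

	/* ... inspect or map 'obj' at 'offset' here ... */

	vm_object_deallocate(obj);	/* drop the reference taken above */
	return (0);
}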
+
+int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
@@ -633,26 +793,215 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+static int
+vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ vcpu_assert_locked(vcpu);
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> FROZEN -> IDLE
+ * FROZEN -> RUNNING -> FROZEN
+ * FROZEN -> SLEEPING -> FROZEN
+ */
+ switch (vcpu->state) {
+ case VCPU_IDLE:
+ case VCPU_RUNNING:
+ case VCPU_SLEEPING:
+ error = (newstate != VCPU_FROZEN);
+ break;
+ case VCPU_FROZEN:
+ error = (newstate == VCPU_FROZEN);
+ break;
+ default:
+ error = 1;
+ break;
+ }
+
+ if (error == 0)
+ vcpu->state = newstate;
+ else
+ error = EBUSY;
+
+ return (error);
+}
+
+static void
+vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
+ panic("Error %d setting state to %d\n", error, newstate);
+}
+
+static void
+vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
+ panic("Error %d setting state to %d", error, newstate);
+}
+
+/*
+ * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
+ */
+static int
+vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ struct vcpu *vcpu;
+ int sleepticks, t;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+
+ /*
+ * Figure out the number of host ticks until the next apic
+ * timer interrupt in the guest.
+ */
+ sleepticks = lapic_timer_tick(vm, vcpuid);
+
+ /*
+ * If the guest local apic timer is disabled then sleep for
+ * a long time but not forever.
+ */
+ if (sleepticks < 0)
+ sleepticks = hz;
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
+ if (sleepticks <= 0)
+ panic("invalid sleepticks %d", sleepticks);
+ t = ticks;
+ vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
+ vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+ vcpu_unlock(vcpu);
+
+ return (0);
+}
+
+static int
+vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ int rv, ftype;
+ struct vm_map *map;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ ftype = vme->u.paging.fault_type;
+ KASSERT(ftype == VM_PROT_READ ||
+ ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
+ ("vm_handle_paging: invalid fault_type %d", ftype));
+
+ if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
+ rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
+ vme->u.paging.gpa, ftype);
+ if (rv == 0)
+ goto done;
+ }
+
+ map = &vm->vmspace->vm_map;
+ rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
+
+ VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
+ rv, vme->u.paging.gpa, ftype);
+
+ if (rv != KERN_SUCCESS)
+ return (EFAULT);
+done:
+ /* restart execution at the faulting instruction */
+ vme->inst_length = 0;
+
+ return (0);
+}
+
+static int
+vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
+{
+ struct vie *vie;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+ int error, inst_length;
+ uint64_t rip, gla, gpa, cr3;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ rip = vme->rip;
+ inst_length = vme->inst_length;
+
+ gla = vme->u.inst_emul.gla;
+ gpa = vme->u.inst_emul.gpa;
+ cr3 = vme->u.inst_emul.cr3;
+ vie = &vme->u.inst_emul.vie;
+
+ vie_init(vie);
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
+ return (EFAULT);
+
+ if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
+ return (EFAULT);
+
+ /* return to userland unless this is a local apic access */
+ if (gpa < DEFAULT_APIC_BASE || gpa >= DEFAULT_APIC_BASE + PAGE_SIZE) {
+ *retu = TRUE;
+ return (0);
+ }
+
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie,
+ lapic_mmio_read, lapic_mmio_write, 0);
+
+ /* return to userland to spin up the AP */
+ if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
+ *retu = TRUE;
+
+ return (error);
+}
+
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
- int error, vcpuid, sleepticks, t;
+ int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval, rip;
struct vm_exit *vme;
+ boolean_t retu;
+ pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
+ pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
- vme = &vmrun->vm_exit;
+ vme = &vcpu->exitinfo;
rip = vmrun->rip;
restart:
critical_enter();
+ KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+ ("vm_run: absurd pm_active"));
+
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
@@ -661,62 +1010,44 @@ restart:
restore_guest_msrs(vm, vcpuid);
restore_guest_fpustate(vcpu);
+ vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
- error = VMRUN(vm->cookie, vcpuid, rip);
+ error = VMRUN(vm->cookie, vcpuid, rip, pmap);
vcpu->hostcpu = NOCPU;
+ vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
restore_host_msrs(vm, vcpuid);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
- /* copy the exit information */
- bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
-
critical_exit();
- /*
- * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
- * is ready to run.
- */
- if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
- vcpu_lock(vcpu);
-
- /*
- * Figure out the number of host ticks until the next apic
- * timer interrupt in the guest.
- */
- sleepticks = lapic_timer_tick(vm, vcpuid);
-
- /*
- * If the guest local apic timer is disabled then sleep for
- * a long time but not forever.
- */
- if (sleepticks < 0)
- sleepticks = hz;
-
- /*
- * Do a final check for pending NMI or interrupts before
- * really putting this thread to sleep.
- *
- * These interrupts could have happened any time after we
- * returned from VMRUN() and before we grabbed the vcpu lock.
- */
- if (!vm_nmi_pending(vm, vcpuid) &&
- lapic_pending_intr(vm, vcpuid) < 0) {
- if (sleepticks <= 0)
- panic("invalid sleepticks %d", sleepticks);
- t = ticks;
- msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
- vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ if (error == 0) {
+ retu = FALSE;
+ switch (vme->exitcode) {
+ case VM_EXITCODE_HLT:
+ error = vm_handle_hlt(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_PAGING:
+ error = vm_handle_paging(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_INST_EMUL:
+ error = vm_handle_inst_emul(vm, vcpuid, &retu);
+ break;
+ default:
+ retu = TRUE; /* handled in userland */
+ break;
}
+ }
- vcpu_unlock(vcpu);
-
+ if (error == 0 && retu == FALSE) {
rip = vme->rip + vme->inst_length;
goto restart;
}
+ /* copy the exit information */
+ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
@@ -869,7 +1200,7 @@ vm_iommu_domain(struct vm *vm)
}
int
-vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
struct vcpu *vcpu;
@@ -880,20 +1211,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
-
- /*
- * The following state transitions are allowed:
- * IDLE -> RUNNING -> IDLE
- * IDLE -> CANNOT_RUN -> IDLE
- */
- if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
- (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
- error = 0;
- vcpu->state = state;
- } else {
- error = EBUSY;
- }
-
+ error = vcpu_set_state_locked(vcpu, newstate);
vcpu_unlock(vcpu);
return (error);
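
To illustrate the transition table now enforced by vcpu_set_state_locked() (a sketch only; example_modify_vcpu is a hypothetical name), an ioctl-style caller would freeze a vcpu before touching its state and return it to IDLE afterwards:

static int
example_modify_vcpu(struct vm *vm, int vcpuid)
{
	int error;

	/* IDLE -> FROZEN; EBUSY if another thread already froze the vcpu. */
	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN);
	if (error)
		return (error);

	/* ... it is now safe to read or modify the vcpu's state ... */

	/* FROZEN -> IDLE */
	vcpu_set_state(vm, vcpuid, VCPU_IDLE);
	return (0);
}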
@@ -979,16 +1297,7 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
vcpu_lock(vcpu);
hostcpu = vcpu->hostcpu;
if (hostcpu == NOCPU) {
- /*
- * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
- * the host thread must be sleeping waiting for an event to
- * kick the vcpu out of 'hlt'.
- *
- * XXX this is racy because the condition exists right before
- * and after calling VMRUN() in vm_run(). The wakeup() is
- * benign in this case.
- */
- if (vcpu->state == VCPU_RUNNING)
+ if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
} else {
if (vcpu->state != VCPU_RUNNING)
@@ -998,3 +1307,10 @@ vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
}
vcpu_unlock(vcpu);
}
+
+struct vmspace *
+vm_get_vmspace(struct vm *vm)
+{
+
+ return (vm->vmspace);
+}