author    | Renato Botelho <renato@netgate.com> | 2016-02-03 18:01:26 -0200
committer | Renato Botelho <renato@netgate.com> | 2016-02-03 18:01:26 -0200
commit    | af0758169e63a4e7f6024c241d9254a8bc09908d (patch)
tree      | a31903fc62c875e8d23a391beb8ae57ed0092c45 /sys/amd64
parent    | 79f27b5150f7b79a6f1bcd30e9233f1abb9c3e36 (diff)
parent    | 6114d518f71115abacc5d610c4d668ef6e0b2f37 (diff)
Merge remote-tracking branch 'origin/stable/10' into devel
Diffstat (limited to 'sys/amd64')
-rw-r--r-- | sys/amd64/amd64/fpu.c                | 21
-rw-r--r-- | sys/amd64/include/vmm.h              | 35
-rw-r--r-- | sys/amd64/include/vmm_dev.h          | 38
-rw-r--r-- | sys/amd64/vmm/amd/svm.c              | 2
-rw-r--r-- | sys/amd64/vmm/intel/vmx.c            | 2
-rw-r--r-- | sys/amd64/vmm/io/ppt.c               | 16
-rw-r--r-- | sys/amd64/vmm/vmm.c                  | 471
-rw-r--r-- | sys/amd64/vmm/vmm_dev.c              | 398
-rw-r--r-- | sys/amd64/vmm/vmm_instruction_emul.c | 63
-rw-r--r-- | sys/amd64/vmm/vmm_mem.c              | 32
-rw-r--r-- | sys/amd64/vmm/vmm_mem.h              | 2
11 files changed, 758 insertions, 322 deletions
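
Note: the vmm_dev.h and vmm_dev.c hunks below replace the deprecated VM_MAP_MEMORY / VM_GET_MEMORY_SEG ioctls with VM_ALLOC_MEMSEG, VM_MMAP_MEMSEG, VM_GET_MEMSEG and VM_MMAP_GETNEXT, operating on the new struct vm_memseg and struct vm_memmap. The following is only a minimal userland sketch of driving the new ioctls directly; it is not part of this commit, it assumes a VM named "myvm" has already been created (e.g. via the hw.vmm.create sysctl), and real bhyve tooling goes through libvmmapi rather than raw ioctls.

#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct vm_memseg memseg;
	struct vm_memmap memmap;
	int fd;

	/* The VM is assumed to already exist. */
	fd = open("/dev/vmm/myvm", O_RDWR);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	/*
	 * VM_ALLOC_MEMSEG: create memory segment 0.  Leaving 'name' empty
	 * makes it a system-memory segment; a non-empty name would instead
	 * create a devmem segment and a /dev/vmm.io/<vm>.<name> node.
	 */
	memset(&memseg, 0, sizeof(memseg));
	memseg.segid = 0;
	memseg.len = 256 * 1024 * 1024;		/* 256 MB */
	if (ioctl(fd, VM_ALLOC_MEMSEG, &memseg) != 0)
		perror("VM_ALLOC_MEMSEG");

	/*
	 * VM_MMAP_MEMSEG: install the whole segment at guest physical
	 * address 0.  Inside the kernel this ioctl freezes all vcpus
	 * before modifying the guest memory map.
	 */
	memset(&memmap, 0, sizeof(memmap));
	memmap.gpa = 0;
	memmap.segid = memseg.segid;
	memmap.segoff = 0;
	memmap.len = memseg.len;
	memmap.prot = PROT_READ | PROT_WRITE | PROT_EXEC;
	memmap.flags = 0;		/* VM_MEMMAP_F_WIRED for passthru */
	if (ioctl(fd, VM_MMAP_MEMSEG, &memmap) != 0)
		perror("VM_MMAP_MEMSEG");

	return (0);
}

In the kernel, vm_alloc_memseg() backs the segment with a VM object and vm_mmap_memseg() installs it into the guest vmspace, optionally wiring the pages (VM_MEMMAP_F_WIRED), which the IOMMU/passthru path requires.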
diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c index f30c073..d9c967c 100644 --- a/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -318,13 +318,15 @@ fpuinitstate(void *arg __unused) cpu_mxcsr_mask = 0xFFBF; /* - * The fninit instruction does not modify XMM registers. The - * fpusave call dumped the garbage contained in the registers - * after reset to the initial state saved. Clear XMM - * registers file image to make the startup program state and - * signal handler XMM register content predictable. + * The fninit instruction does not modify XMM registers or x87 + * registers (MM/ST). The fpusave call dumped the garbage + * contained in the registers after reset to the initial state + * saved. Clear XMM and x87 registers file image to make the + * startup program state and signal handler XMM/x87 register + * content predictable. */ - bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc)); + bzero(fpu_initialstate->sv_fp, sizeof(fpu_initialstate->sv_fp)); + bzero(fpu_initialstate->sv_xmm, sizeof(fpu_initialstate->sv_xmm)); /* * Create a table describing the layout of the CPU Extended @@ -375,7 +377,7 @@ fpuexit(struct thread *td) } int -fpuformat() +fpuformat(void) { return (_MC_FPFMT_XMM); @@ -661,7 +663,8 @@ fpudna(void) * fpu_initialstate, to ignite the XSAVEOPT * tracking engine. */ - bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size); + bcopy(fpu_initialstate, curpcb->pcb_save, + cpu_max_ext_state_size); fpurestore(curpcb->pcb_save); if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__) fldcw(curpcb->pcb_initial_fpucw); @@ -676,7 +679,7 @@ fpudna(void) } void -fpudrop() +fpudrop(void) { struct thread *td; diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 1a4e5ab..f2de960 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -108,7 +108,6 @@ enum x2apic_state { struct vm; struct vm_exception; -struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; @@ -175,17 +174,33 @@ int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); -int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); -void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, - void **cookie); +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. 
+ */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); void vm_gpa_release(void *cookie); -int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg); -int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, - vm_offset_t *offset, struct vm_object **object); -boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, @@ -302,8 +317,6 @@ vcpu_should_yield(struct vm *vm, int vcpu) void *vcpu_stats(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); -int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); -int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); struct vpmtmr *vm_pmtmr(struct vm *vm); diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h index 9d031a9..1af75a3 100644 --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -34,10 +34,22 @@ void vmmdev_init(void); int vmmdev_cleanup(void); #endif -struct vm_memory_segment { - vm_paddr_t gpa; /* in */ +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 + +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? 
(m)->name : NULL) +struct vm_memseg { + int segid; size_t len; - int wired; + char name[SPECNAMELEN + 1]; }; struct vm_register { @@ -214,10 +226,14 @@ enum { IOCNUM_REINIT = 5, /* memory apis */ - IOCNUM_MAP_MEMORY = 10, - IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_MAP_MEMORY = 10, /* deprecated */ + IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, @@ -278,10 +294,14 @@ enum { _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) #define VM_REINIT \ _IO('v', IOCNUM_REINIT) -#define VM_MAP_MEMORY \ - _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) -#define VM_GET_MEMORY_SEG \ - _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index abca613..ca5141a 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -1477,7 +1477,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " "reserved bits set: info1(%#lx) info2(%#lx)", info1, info2); - } else if (vm_mem_allocated(svm_sc->vm, info2)) { + } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = info2; vmexit->u.paging.fault_type = npf_fault_type(info1); diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 5bbfe99..c5cafa8 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -2426,7 +2426,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * this must be an instruction that accesses MMIO space. 
*/ gpa = vmcs_gpa(); - if (vm_mem_allocated(vmx->vm, gpa) || + if (vm_mem_allocated(vmx->vm, vcpu, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index b789f77..692190a 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -76,11 +76,17 @@ struct pptintr_arg { /* pptintr(pptintr_arg) */ uint64_t msg_data; }; +struct pptseg { + vm_paddr_t gpa; + size_t len; + int wired; +}; + struct pptdev { device_t dev; struct vm *vm; /* owner of this device */ TAILQ_ENTRY(pptdev) next; - struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct pptseg mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ @@ -207,14 +213,14 @@ static void ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) { int i; - struct vm_memory_segment *seg; + struct pptseg *seg; for (i = 0; i < MAX_MMIOSEGS; i++) { seg = &ppt->mmio[i]; if (seg->len == 0) continue; (void)vm_unmap_mmio(vm, seg->gpa, seg->len); - bzero(seg, sizeof(struct vm_memory_segment)); + bzero(seg, sizeof(struct pptseg)); } } @@ -324,7 +330,7 @@ ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) { int i; struct pptdev *ppt; - struct vm_memory_segment *seg; + struct pptseg *seg; TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm != vm) @@ -410,7 +416,7 @@ ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { int i, error; - struct vm_memory_segment *seg; + struct pptseg *seg; struct pptdev *ppt; ppt = ppt_find(bus, slot, func); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c3c7bb1..31e35d2 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -120,12 +120,21 @@ struct vcpu { #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) struct mem_seg { + size_t len; + bool sysmem; + struct vm_object *object; +}; +#define VM_MAX_MEMSEGS 2 + +struct mem_map { vm_paddr_t gpa; size_t len; - boolean_t wired; - vm_object_t object; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; }; -#define VM_MAX_MEMORY_SEGMENTS 2 +#define VM_MAX_MEMMAPS 4 /* * Initialization: @@ -151,8 +160,8 @@ struct vm { void *rendezvous_arg; /* (x) rendezvous func/arg */ vm_rendezvous_func_t rendezvous_func; struct mtx rendezvous_mtx; /* (o) rendezvous lock */ - int num_mem_segs; /* (o) guest memory segments */ - struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ struct vmspace *vmspace; /* (o) guest's address space */ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ @@ -224,6 +233,8 @@ TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu); SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0, "Force use of I/O MMU even if no passthrough devices were found."); +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); #ifdef KTR @@ -444,7 +455,6 @@ vm_create(const char *name, struct vm **retvm) vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); - vm->num_mem_segs = 0; vm->vmspace = vmspace; mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); @@ -455,18 +465,9 @@ vm_create(const char *name, struct vm **retvm) } static void -vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) -{ - - if 
(seg->object != NULL) - vmm_mem_free(vm->vmspace, seg->gpa, seg->len); - - bzero(seg, sizeof(*seg)); -} - -static void vm_cleanup(struct vm *vm, bool destroy) { + struct mem_map *mm; int i; ppt_unassign_all(vm); @@ -489,11 +490,23 @@ vm_cleanup(struct vm *vm, bool destroy) VMCLEANUP(vm->cookie); - if (destroy) { - for (i = 0; i < vm->num_mem_segs; i++) - vm_free_mem_seg(vm, &vm->mem_segs[i]); + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); + } - vm->num_mem_segs = 0; + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); VMSPACE_FREE(vm->vmspace); vm->vmspace = NULL; @@ -551,146 +564,243 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) return (0); } -boolean_t -vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) +/* + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. + */ +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { + struct mem_map *mm; int i; - vm_paddr_t gpabase, gpalimit; - for (i = 0; i < vm->num_mem_segs; i++) { - gpabase = vm->mem_segs[i].gpa; - gpalimit = gpabase + vm->mem_segs[i].len; - if (gpa >= gpabase && gpa < gpalimit) - return (TRUE); /* 'gpa' is regular memory */ +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ } if (ppt_is_mmio(vm, gpa)) - return (TRUE); /* 'gpa' is pci passthru mmio */ + return (true); /* 'gpa' is pci passthru mmio */ - return (FALSE); + return (false); } int -vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { - int available, allocated; struct mem_seg *seg; - vm_object_t object; - vm_paddr_t g; + vm_object_t obj; - if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); - - available = allocated = 0; - g = gpa; - while (g < gpa + len) { - if (vm_mem_allocated(vm, g)) - allocated++; - else - available++; - g += PAGE_SIZE; - } - - /* - * If there are some allocated and some available pages in the address - * range then it is an error. - */ - if (allocated && available) + if (len == 0 || (len & PAGE_MASK)) return (EINVAL); - /* - * If the entire address range being requested has already been - * allocated then there isn't anything more to do. 
- */ - if (allocated && available == 0) - return (0); - - if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) - return (E2BIG); - - seg = &vm->mem_segs[vm->num_mem_segs]; + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } - if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) return (ENOMEM); - seg->gpa = gpa; seg->len = len; - seg->object = object; - seg->wired = FALSE; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} - vm->num_mem_segs++; +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; return (0); } -static vm_paddr_t -vm_maxmem(struct vm *vm) +void +vm_free_memseg(struct vm *vm, int ident) { - int i; - vm_paddr_t gpa, maxmem; + struct mem_seg *seg; - maxmem = 0; - for (i = 0; i < vm->num_mem_segs; i++) { - gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; - if (gpa > maxmem) - maxmem = gpa; + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); } - return (maxmem); } -static void -vm_gpa_unwire(struct vm *vm) +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) { - int i, rv; struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; - for (i = 0; i < vm->num_mem_segs; i++) { - seg = &vm->mem_segs[i]; - if (!seg->wired) - continue; + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); - rv = vm_map_unwire(&vm->vmspace->vm_map, - seg->gpa, seg->gpa + seg->len, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); - KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " - "%#lx/%ld could not be unwired: %d", - vm_name(vm), seg->gpa, seg->len, rv)); + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } - seg->wired = FALSE; + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if (flags & VM_MEMMAP_F_WIRED) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (EFAULT); + } } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); } -static int -vm_gpa_wire(struct vm *vm) +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { - int i, rv; - struct 
mem_seg *seg; + struct mem_map *mm, *mmnext; + int i; - for (i = 0; i < vm->num_mem_segs; i++) { - seg = &vm->mem_segs[i]; - if (seg->wired) + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } - /* XXX rlimits? */ - rv = vm_map_wire(&vm->vmspace->vm_map, - seg->gpa, seg->gpa + seg->len, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); - if (rv != KERN_SUCCESS) - break; - - seg->wired = TRUE; + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); } +} - if (i < vm->num_mem_segs) { - /* - * Undo the wiring before returning an error. - */ - vm_gpa_unwire(vm); - return (EAGAIN); +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); } +} - return (0); +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +static vm_paddr_t +sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); } static void @@ -698,20 +808,36 @@ vm_iommu_modify(struct vm *vm, boolean_t map) { int i, sz; vm_paddr_t gpa, hpa; - struct mem_seg *seg; + struct mem_map *mm; void *vp, *cookie, *host_domain; sz = PAGE_SIZE; host_domain = iommu_host_domain(); - for (i = 0; i < vm->num_mem_segs; i++) { - seg = &vm->mem_segs[i]; - KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", - vm_name(vm), seg->gpa, seg->len)); + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (!sysmem_mapping(vm, mm)) + continue; - gpa = seg->gpa; - while (gpa < seg->gpa + seg->len) { - vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, + if (map) { + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + } else { + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + } + + gpa = mm->gpa; + while (gpa < mm->gpa + mm->len) { + vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, &cookie); KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", vm_name(vm), gpa)); @@ -753,10 +879,9 @@ vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) if (error) return (error); - if (ppt_assigned_devices(vm) == 0) { + if (ppt_assigned_devices(vm) == 0) vm_iommu_unmap(vm); - vm_gpa_unwire(vm); - } + return (0); } @@ -766,23 +891,12 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) int error; vm_paddr_t maxaddr; - /* - * Virtual machines with pci passthru devices get special treatment: - * - the 
guest physical memory is wired - * - the iommu is programmed to do the 'gpa' to 'hpa' translation - * - * We need to do this before the first pci passthru device is attached. - */ + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); - maxaddr = vm_maxmem(vm); + maxaddr = sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); - - error = vm_gpa_wire(vm); - if (error) - return (error); - vm_iommu_map(vm); } @@ -791,18 +905,43 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) } void * -vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, +vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { - int count, pageoff; + int i, count, pageoff; + struct mem_map *mm; vm_page_t m; - +#ifdef INVARIANTS + /* + * All vcpus are frozen by ioctls that modify the memory map + * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is + * guaranteed if at least one vcpu is in the VCPU_FROZEN state. + */ + int state; + KASSERT(vcpuid >= -1 || vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d", + __func__, vcpuid)); + for (i = 0; i < VM_MAXCPU; i++) { + if (vcpuid != -1 && vcpuid != i) + continue; + state = vcpu_get_state(vm, i, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); + } +#endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, - trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } if (count == 1) { *cookie = m; @@ -824,50 +963,6 @@ vm_gpa_release(void *cookie) } int -vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg) -{ - int i; - - for (i = 0; i < vm->num_mem_segs; i++) { - if (gpabase == vm->mem_segs[i].gpa) { - seg->gpa = vm->mem_segs[i].gpa; - seg->len = vm->mem_segs[i].len; - seg->wired = vm->mem_segs[i].wired; - return (0); - } - } - return (-1); -} - -int -vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, - vm_offset_t *offset, struct vm_object **object) -{ - int i; - size_t seg_len; - vm_paddr_t seg_gpa; - vm_object_t seg_obj; - - for (i = 0; i < vm->num_mem_segs; i++) { - if ((seg_obj = vm->mem_segs[i].object) == NULL) - continue; - - seg_gpa = vm->mem_segs[i].gpa; - seg_len = vm->mem_segs[i].len; - - if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { - *offset = gpa - seg_gpa; - *object = seg_obj; - vm_object_reference(seg_obj); - return (0); - } - } - - return (EINVAL); -} - -int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { @@ -2425,8 +2520,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, } for (idx = 0; idx < nused; idx++) { - hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, - prot, &cookie); + hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, + copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index e3e140a..5cb4150 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm.h> #include <vm/pmap.h> 
#include <vm/vm_map.h> +#include <vm/vm_object.h> #include <machine/vmparam.h> #include <machine/vmm.h> @@ -60,10 +61,19 @@ __FBSDID("$FreeBSD$"); #include "io/vhpet.h" #include "io/vrtc.h" +struct devmem_softc { + int segid; + char *name; + struct cdev *cdev; + struct vmmdev_softc *sc; + SLIST_ENTRY(devmem_softc) link; +}; + struct vmmdev_softc { struct vm *vm; /* vm instance cookie */ struct cdev *cdev; SLIST_ENTRY(vmmdev_softc) link; + SLIST_HEAD(, devmem_softc) devmem; int flags; }; #define VSC_LINKED 0x01 @@ -76,6 +86,63 @@ static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); SYSCTL_DECL(_hw_vmm); +static int devmem_create_cdev(const char *vmname, int id, char *devmem); +static void devmem_destroy(void *arg); + +static int +vcpu_lock_one(struct vmmdev_softc *sc, int vcpu) +{ + int error; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + return (error); +} + +static void +vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu) +{ + enum vcpu_state state; + + state = vcpu_get_state(sc->vm, vcpu, NULL); + if (state != VCPU_FROZEN) { + panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm), + vcpu, state); + } + + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); +} + +static int +vcpu_lock_all(struct vmmdev_softc *sc) +{ + int error, vcpu; + + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + error = vcpu_lock_one(sc, vcpu); + if (error) + break; + } + + if (error) { + while (--vcpu >= 0) + vcpu_unlock_one(sc, vcpu); + } + + return (error); +} + +static void +vcpu_unlock_all(struct vmmdev_softc *sc) +{ + int vcpu; + + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) + vcpu_unlock_one(sc, vcpu); +} + static struct vmmdev_softc * vmmdev_lookup(const char *name) { @@ -108,12 +175,16 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) void *hpa, *cookie; struct vmmdev_softc *sc; - static char zerobuf[PAGE_SIZE]; - - error = 0; sc = vmmdev_lookup2(cdev); if (sc == NULL) - error = ENXIO; + return (ENXIO); + + /* + * Get a read lock on the guest memory map by freezing any vcpu. + */ + error = vcpu_lock_one(sc, VM_MAXCPU - 1); + if (error) + return (error); prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); while (uio->uio_resid > 0 && error == 0) { @@ -129,10 +200,11 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) * Since this device does not support lseek(2), dd(1) will * read(2) blocks of data to simulate the lseek(2). 
*/ - hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); + hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie); if (hpa == NULL) { if (uio->uio_rw == UIO_READ) - error = uiomove(zerobuf, c, uio); + error = uiomove(__DECONST(void *, zero_region), + c, uio); else error = EFAULT; } else { @@ -140,6 +212,70 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) vm_gpa_release(cookie); } } + vcpu_unlock_one(sc, VM_MAXCPU - 1); + return (error); +} + +CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1); + +static int +get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + struct devmem_softc *dsc; + int error; + bool sysmem; + + error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL); + if (error || mseg->len == 0) + return (error); + + if (!sysmem) { + SLIST_FOREACH(dsc, &sc->devmem, link) { + if (dsc->segid == mseg->segid) + break; + } + KASSERT(dsc != NULL, ("%s: devmem segment %d not found", + __func__, mseg->segid)); + error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL); + } else { + bzero(mseg->name, sizeof(mseg->name)); + } + + return (error); +} + +static int +alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg) +{ + char *name; + int error; + bool sysmem; + + error = 0; + name = NULL; + sysmem = true; + + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; + name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK); + error = copystr(VM_MEMSEG_NAME(mseg), name, SPECNAMELEN + 1, 0); + if (error) + goto done; + } + + error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem); + if (error) + goto done; + + if (VM_MEMSEG_NAME(mseg)) { + error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name); + if (error) + vm_free_memseg(sc->vm, mseg->segid); + else + name = NULL; /* freed when 'cdev' is destroyed */ + } +done: + free(name, M_VMMDEV); return (error); } @@ -150,7 +286,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, int error, vcpu, state_changed, size; cpuset_t *cpuset; struct vmmdev_softc *sc; - struct vm_memory_segment *seg; struct vm_register *vmreg; struct vm_seg_desc *vmsegdesc; struct vm_run *vmrun; @@ -177,6 +312,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_intinfo *vmii; struct vm_rtc_time *rtctime; struct vm_rtc_data *rtcdata; + struct vm_memmap *mm; sc = vmmdev_lookup2(cdev); if (sc == NULL) @@ -211,43 +347,41 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, * Assumes that the first field of the ioctl data is the vcpu. */ vcpu = *(int *)data; - if (vcpu < 0 || vcpu >= VM_MAXCPU) { - error = EINVAL; - goto done; - } - - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + error = vcpu_lock_one(sc, vcpu); if (error) goto done; - state_changed = 1; break; case VM_MAP_PPTDEV_MMIO: case VM_BIND_PPTDEV: case VM_UNBIND_PPTDEV: - case VM_MAP_MEMORY: + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: case VM_REINIT: /* * ioctls that operate on the entire virtual machine must * prevent all vcpus from running. */ - error = 0; - for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); - if (error) - break; - } - - if (error) { - while (--vcpu >= 0) - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + error = vcpu_lock_all(sc); + if (error) goto done; - } - state_changed = 2; break; + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + /* + * Lock a vcpu to make sure that the memory map cannot be + * modified while it is being inspected. 
+ */ + vcpu = VM_MAXCPU - 1; + error = vcpu_lock_one(sc, vcpu); + if (error) + goto done; + state_changed = 1; + break; + default: break; } @@ -372,15 +506,21 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = vatpic_set_irq_trigger(sc->vm, isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); break; - case VM_MAP_MEMORY: - seg = (struct vm_memory_segment *)data; - error = vm_malloc(sc->vm, seg->gpa, seg->len); + case VM_MMAP_GETNEXT: + mm = (struct vm_memmap *)data; + error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid, + &mm->segoff, &mm->len, &mm->prot, &mm->flags); break; - case VM_GET_MEMORY_SEG: - seg = (struct vm_memory_segment *)data; - seg->len = 0; - (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); - error = 0; + case VM_MMAP_MEMSEG: + mm = (struct vm_memmap *)data; + error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff, + mm->len, mm->prot, mm->flags); + break; + case VM_ALLOC_MEMSEG: + error = alloc_memseg(sc, (struct vm_memseg *)data); + break; + case VM_GET_MEMSEG: + error = get_memseg(sc, (struct vm_memseg *)data); break; case VM_GET_REGISTER: vmreg = (struct vm_register *)data; @@ -505,12 +645,10 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, break; } - if (state_changed == 1) { - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); - } else if (state_changed == 2) { - for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); - } + if (state_changed == 1) + vcpu_unlock_one(sc, vcpu); + else if (state_changed == 2) + vcpu_unlock_all(sc); done: /* Make sure that no handler returns a bogus value like ERESTART */ @@ -519,26 +657,79 @@ done: } static int -vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, - vm_size_t size, struct vm_object **object, int nprot) +vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize, + struct vm_object **objp, int nprot) { - int error; struct vmmdev_softc *sc; + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff, first, last; + int error, found, segid; + bool sysmem; + + first = *offset; + last = first + mapsize; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); sc = vmmdev_lookup2(cdev); - if (sc != NULL && (nprot & PROT_EXEC) == 0) - error = vm_get_memobj(sc->vm, *offset, size, offset, object); - else - error = EINVAL; + if (sc == NULL) { + /* virtual machine is in the process of being created */ + return (EINVAL); + } + /* + * Get a read lock on the guest memory map by freezing any vcpu. 
+ */ + error = vcpu_lock_one(sc, VM_MAXCPU - 1); + if (error) + return (error); + + gpa = 0; + found = 0; + while (!found) { + error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len, + NULL, NULL); + if (error) + break; + + if (first >= gpa && last <= gpa + len) + found = 1; + else + gpa += len; + } + + if (found) { + error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp); + KASSERT(error == 0 && *objp != NULL, + ("%s: invalid memory segment %d", __func__, segid)); + if (sysmem) { + vm_object_reference(*objp); + *offset = segoff + (first - gpa); + } else { + error = EINVAL; + } + } + vcpu_unlock_one(sc, VM_MAXCPU - 1); return (error); } static void vmmdev_destroy(void *arg) { - struct vmmdev_softc *sc = arg; + struct devmem_softc *dsc; + int error; + + error = vcpu_lock_all(sc); + KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error)); + + while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) { + KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__)); + SLIST_REMOVE_HEAD(&sc->devmem, link); + free(dsc->name, M_VMMDEV); + free(dsc, M_VMMDEV); + } if (sc->cdev != NULL) destroy_dev(sc->cdev); @@ -560,6 +751,7 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) { int error; char buf[VM_MAX_NAMELEN]; + struct devmem_softc *dsc; struct vmmdev_softc *sc; struct cdev *cdev; @@ -578,22 +770,30 @@ sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) /* * The 'cdev' will be destroyed asynchronously when 'si_threadcount' * goes down to 0 so we should not do it again in the callback. + * + * Setting 'sc->cdev' to NULL is also used to indicate that the VM + * is scheduled for destruction. */ cdev = sc->cdev; sc->cdev = NULL; mtx_unlock(&vmmdev_mtx); /* - * Schedule the 'cdev' to be destroyed: + * Schedule all cdevs to be destroyed: * - * - any new operations on this 'cdev' will return an error (ENXIO). + * - any new operations on the 'cdev' will return an error (ENXIO). * * - when the 'si_threadcount' dwindles down to zero the 'cdev' will * be destroyed and the callback will be invoked in a taskqueue * context. 
+ * + * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev' */ + SLIST_FOREACH(dsc, &sc->devmem, link) { + KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed")); + destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc); + } destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); - return (0); } SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, @@ -634,6 +834,7 @@ sysctl_vmm_create(SYSCTL_HANDLER_ARGS) sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); sc->vm = vm; + SLIST_INIT(&sc->devmem); /* * Lookup the name again just in case somebody sneaked in when we @@ -687,3 +888,96 @@ vmmdev_cleanup(void) return (error); } + +static int +devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len, + struct vm_object **objp, int nprot) +{ + struct devmem_softc *dsc; + vm_ooffset_t first, last; + size_t seglen; + int error; + bool sysmem; + + dsc = cdev->si_drv1; + if (dsc == NULL) { + /* 'cdev' has been created but is not ready for use */ + return (ENXIO); + } + + first = *offset; + last = *offset + len; + if ((nprot & PROT_EXEC) || first < 0 || first >= last) + return (EINVAL); + + error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1); + if (error) + return (error); + + error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp); + KASSERT(error == 0 && !sysmem && *objp != NULL, + ("%s: invalid devmem segment %d", __func__, dsc->segid)); + + vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1); + + if (seglen >= last) { + vm_object_reference(*objp); + return (0); + } else { + return (EINVAL); + } +} + +static struct cdevsw devmemsw = { + .d_name = "devmem", + .d_version = D_VERSION, + .d_mmap_single = devmem_mmap_single, +}; + +static int +devmem_create_cdev(const char *vmname, int segid, char *devname) +{ + struct devmem_softc *dsc; + struct vmmdev_softc *sc; + struct cdev *cdev; + int error; + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname); + if (error) + return (error); + + dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(vmname); + KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname)); + if (sc->cdev == NULL) { + /* virtual machine is being created or destroyed */ + mtx_unlock(&vmmdev_mtx); + free(dsc, M_VMMDEV); + destroy_dev_sched_cb(cdev, NULL, 0); + return (ENODEV); + } + + dsc->segid = segid; + dsc->name = devname; + dsc->cdev = cdev; + dsc->sc = sc; + SLIST_INSERT_HEAD(&sc->devmem, dsc, link); + mtx_unlock(&vmmdev_mtx); + + /* The 'cdev' is ready for use after 'si_drv1' is initialized */ + cdev->si_drv1 = dsc; + return (0); +} + +static void +devmem_destroy(void *arg) +{ + struct devmem_softc *dsc = arg; + + KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__)); + dsc->cdev = NULL; + dsc->sc = NULL; +} diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 758b7e8..ae5330f 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -1677,12 +1677,12 @@ ptp_release(void **cookie) } static void * -ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); - ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } @@ -1729,7 +1729,8 @@ restart: /* Zero out the lower 12 bits. 
*/ ptpphys &= ~0xfff; - ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, + &cookie); if (ptpbase32 == NULL) goto error; @@ -1788,7 +1789,8 @@ restart: /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; - ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, + &cookie); if (ptpbase == NULL) goto error; @@ -1811,7 +1813,7 @@ restart: /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; - ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; @@ -2319,10 +2321,13 @@ decode_moffset(struct vie *vie) * page table fault matches with our instruction decoding. */ static int -verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, + enum vm_cpu_mode cpu_mode) { int error; - uint64_t base, idx, gla2; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) @@ -2355,14 +2360,48 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) } } - /* XXX assuming that the base address of the segment is 0 */ - gla2 = base + vie->scale * idx + vie->displacement; + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. 
+ */ + if (vie->segment_override) + seg = vie->segment_register; + else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) + seg = VM_REG_GUEST_SS; + else + seg = VM_REG_GUEST_DS; + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vm, cpuid, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, + vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { - printf("verify_gla mismatch: " + printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", - base, vie->scale, idx, vie->displacement, gla, gla2); + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); return (-1); } @@ -2396,7 +2435,7 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, return (-1); if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { - if (verify_gla(vm, cpuid, gla, vie)) + if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) return (-1); } diff --git a/sys/amd64/vmm/vmm_mem.c b/sys/amd64/vmm/vmm_mem.c index 1019f2b..c9be6c9 100644 --- a/sys/amd64/vmm/vmm_mem.c +++ b/sys/amd64/vmm/vmm_mem.c @@ -114,38 +114,6 @@ vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) vm_map_remove(&vmspace->vm_map, gpa, gpa + len); } -vm_object_t -vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) -{ - int error; - vm_object_t obj; - - if (gpa & PAGE_MASK) - panic("vmm_mem_alloc: invalid gpa %#lx", gpa); - - if (len == 0 || (len & PAGE_MASK) != 0) - panic("vmm_mem_alloc: invalid allocation size %lu", len); - - obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); - if (obj != NULL) { - error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, - VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); - if (error != KERN_SUCCESS) { - vm_object_deallocate(obj); - obj = NULL; - } - } - - return (obj); -} - -void -vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) -{ - - vm_map_remove(&vmspace->vm_map, gpa, gpa + len); -} - vm_paddr_t vmm_mem_maxaddr(void) { diff --git a/sys/amd64/vmm/vmm_mem.h b/sys/amd64/vmm/vmm_mem.h index a375070..7773faa 100644 --- a/sys/amd64/vmm/vmm_mem.h +++ b/sys/amd64/vmm/vmm_mem.h @@ -33,10 +33,8 @@ struct vmspace; struct vm_object; int vmm_mem_init(void); -struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size); struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); -void vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size); void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); |
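
The new memory-map APIs follow the locking rule spelled out in the vmm.h hunk above: ioctls that modify the map (VM_ALLOC_MEMSEG, VM_MMAP_MEMSEG) freeze all vcpus, while ioctls that only inspect it (VM_GET_MEMSEG, VM_MMAP_GETNEXT, and mmap of the vmm cdev) freeze a single vcpu, which acts as a read lock. The fragment below is a hypothetical illustration, not part of the commit: it enumerates a guest memory map through VM_MMAP_GETNEXT, assuming 'fd' is an already-open /dev/vmm/<name> descriptor.

#include <sys/param.h>
#include <sys/ioctl.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <stdio.h>
#include <string.h>

/*
 * VM_MMAP_GETNEXT returns the lowest mapping whose gpa is at or above the
 * gpa passed in, so advancing by 'len' after each hit visits every mapping
 * in ascending order.  The loop ends when the ioctl fails with ENOENT.
 */
static void
dump_memmap(int fd)
{
	struct vm_memmap mm;

	memset(&mm, 0, sizeof(mm));
	mm.gpa = 0;
	while (ioctl(fd, VM_MMAP_GETNEXT, &mm) == 0) {
		printf("gpa %#lx len %#lx segid %d segoff %#lx "
		    "prot %#x flags %#x\n",
		    (unsigned long)mm.gpa, (unsigned long)mm.len, mm.segid,
		    (unsigned long)mm.segoff, mm.prot, mm.flags);
		mm.gpa += mm.len;	/* look past the mapping just found */
	}
}

vmmdev_mmap_single() in the vmm_dev.c hunk performs the same walk to translate an mmap(2) offset on the vmm cdev into a memory segment and its backing VM object.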