diff options
author | neel <neel@FreeBSD.org> | 2015-06-28 01:21:55 +0000 |
---|---|---|
committer | neel <neel@FreeBSD.org> | 2015-06-28 01:21:55 +0000 |
commit | c85aee0195bf01e2e4666927083768c4f182c0c0 (patch) | |
tree | e72b32c6a72fb1e7f1b2f913718581599207091c /sys | |
parent | 115742fae3f7a7c52d6d5f4894f37e68dff4fd5c (diff) | |
download | FreeBSD-src-c85aee0195bf01e2e4666927083768c4f182c0c0.zip FreeBSD-src-c85aee0195bf01e2e4666927083768c4f182c0c0.tar.gz |
MFC r279444:
Allow passthrough devices to be hinted.
MFC r279683:
When ICW1 is issued the edge sense circuit is reset which means that
following an initialization a low-to-high transition is necessary to
generate an interrupt.
MFC r279925:
Add -p parameter to list PCI device to pass through to the guest.
MFC r281559:
Fix handling of BUS_PROBE_NOWILDCARD in 'device_probe_child()'.
MFC r280447:
When fetching an instruction in non-64bit mode, consider the value of the
code segment base address.
MFC r280725:
Move legacy interrupt allocation for virtio devices to common code.
MFC r280775:
Fix the RTC device model to operate correctly in 12-hour mode.
MFC r280929:
Fix "MOVS" instruction memory to MMIO emulation.
MFC r280968:
Display instruction bytes and %rip prior to aborting due to an instruction
emulation error.
MFC r281145:
Enhance the support for Group 1 Extended opcodes for CMP, AND, OR instructions.
MFC r281542:
Initialize 'error' before use (Coverity IDs 1249748, 1249747, 1249751, 1249749)
MFC r281561:
Prior to aborting due to an ioport error, it is always interesting to see what
the guest's %rip is.
MFC r281611:
If the number of guest vcpus is less than '1' then flag it as an error.
MFC r281612:
Prefer 'vcpu_should_yield()' over checking 'curthread->td_flags' directly.
MFC r281630:
Relax the check on which vectors can be delivered through the APIC. According
to the Intel SDM vectors 16 through 255 are allowed to be delivered via the
local APIC.
MFC r281879:
Missing break in switch case (Coverity ID 1292499)
MFC r281946:
Don't allow guest to modify readonly bits in the PCI config 'status' register.
MFC r281987:
STOS/STOSB/STOSW/STOSD/STOSQ instruction emulation.
MFC r282206:
Implement the century byte in the RTC.
Diffstat (limited to 'sys')
-rw-r--r-- | sys/amd64/include/vmm.h | 1 | ||||
-rw-r--r-- | sys/amd64/include/vmm_instruction_emul.h | 2 | ||||
-rw-r--r-- | sys/amd64/vmm/amd/svm.c | 9 | ||||
-rw-r--r-- | sys/amd64/vmm/intel/vmx.c | 6 | ||||
-rw-r--r-- | sys/amd64/vmm/io/ppt.c | 78 | ||||
-rw-r--r-- | sys/amd64/vmm/io/vatpic.c | 1 | ||||
-rw-r--r-- | sys/amd64/vmm/io/vrtc.c | 113 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm.c | 16 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_dev.c | 4 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_instruction_emul.c | 286 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_lapic.c | 6 | ||||
-rw-r--r-- | sys/kern/subr_bus.c | 18 |
12 files changed, 389 insertions, 151 deletions
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index cf7f5bc..52294bd 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -551,6 +551,7 @@ struct vm_exit { struct { uint64_t gpa; uint64_t gla; + uint64_t cs_base; int cs_d; /* CS.D */ struct vm_guest_paging paging; struct vie vie; diff --git a/sys/amd64/include/vmm_instruction_emul.h b/sys/amd64/include/vmm_instruction_emul.h index 516cc01..651b3b3 100644 --- a/sys/amd64/include/vmm_instruction_emul.h +++ b/sys/amd64/include/vmm_instruction_emul.h @@ -90,7 +90,7 @@ int vmm_fetch_instruction(struct vm *vm, int cpuid, * Returns 1 if an exception was injected into the guest. * Returns -1 otherwise. */ -int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index 88a846d..7cc13ca 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -799,8 +799,14 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); switch(paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.inst_emul.cs_d = 0; + break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = seg.base; + /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ @@ -808,6 +814,7 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) 1 : 0; break; default: + vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } @@ -1911,7 +1918,7 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, } /* We are asked to give the cpu by scheduler. 
*/ - if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) { + if (vcpu_should_yield(vm, vcpu)) { enable_gintr(); vm_exit_astpending(vm, vcpu, state->rip); break; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index b81e48b..9aa55e2 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1785,12 +1785,18 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: + vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index fa7083e..b789f77 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -56,7 +56,6 @@ __FBSDID("$FreeBSD$"); /* XXX locking */ -#define MAX_PPTDEVS (sizeof(pptdevs) / sizeof(pptdevs[0])) #define MAX_MSIMSGS 32 /* @@ -77,9 +76,10 @@ struct pptintr_arg { /* pptintr(pptintr_arg) */ uint64_t msg_data; }; -static struct pptdev { +struct pptdev { device_t dev; struct vm *vm; /* owner of this device */ + TAILQ_ENTRY(pptdev) next; struct vm_memory_segment mmio[MAX_MMIOSEGS]; struct { int num_msgs; /* guest state */ @@ -99,7 +99,7 @@ static struct pptdev { void **cookie; struct pptintr_arg *arg; } msix; -} pptdevs[64]; +}; SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices"); @@ -108,6 +108,8 @@ static int num_pptdevs; SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0, "number of pci passthru devices"); +static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list); + static int 
ppt_probe(device_t dev) { @@ -125,26 +127,30 @@ ppt_probe(device_t dev) * - be allowed by administrator to be used in this role * - be an endpoint device */ - if (vmm_is_pptdev(bus, slot, func) && - (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL) + if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) + return (ENXIO); + else if (vmm_is_pptdev(bus, slot, func)) return (0); else - return (ENXIO); + /* + * Returning BUS_PROBE_NOWILDCARD here matches devices that the + * SR-IOV infrastructure specified as "ppt" passthrough devices. + * All normal devices that did not have "ppt" specified as their + * driver will not be matched by this. + */ + return (BUS_PROBE_NOWILDCARD); } static int ppt_attach(device_t dev) { - int n; + struct pptdev *ppt; - if (num_pptdevs >= MAX_PPTDEVS) { - printf("ppt_attach: maximum number of pci passthrough devices " - "exceeded\n"); - return (ENXIO); - } + ppt = device_get_softc(dev); - n = num_pptdevs++; - pptdevs[n].dev = dev; + num_pptdevs++; + TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); + ppt->dev = dev; if (bootverbose) device_printf(dev, "attached\n"); @@ -155,10 +161,14 @@ ppt_attach(device_t dev) static int ppt_detach(device_t dev) { - /* - * XXX check whether there are any pci passthrough devices assigned - * to guests before we allow this driver to detach. 
- */ + struct pptdev *ppt; + + ppt = device_get_softc(dev); + + if (ppt->vm != NULL) + return (EBUSY); + num_pptdevs--; + TAILQ_REMOVE(&pptdev_list, ppt, next); return (0); } @@ -172,22 +182,23 @@ static device_method_t ppt_methods[] = { }; static devclass_t ppt_devclass; -DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0); +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev)); DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); static struct pptdev * ppt_find(int bus, int slot, int func) { device_t dev; - int i, b, s, f; + struct pptdev *ppt; + int b, s, f; - for (i = 0; i < num_pptdevs; i++) { - dev = pptdevs[i].dev; + TAILQ_FOREACH(ppt, &pptdev_list, next) { + dev = ppt->dev; b = pci_get_bus(dev); s = pci_get_slot(dev); f = pci_get_function(dev); if (bus == b && slot == s && func == f) - return (&pptdevs[i]); + return (ppt); } return (NULL); } @@ -297,11 +308,12 @@ ppt_avail_devices(void) int ppt_assigned_devices(struct vm *vm) { - int i, num; + struct pptdev *ppt; + int num; num = 0; - for (i = 0; i < num_pptdevs; i++) { - if (pptdevs[i].vm == vm) + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) num++; } return (num); @@ -310,12 +322,11 @@ ppt_assigned_devices(struct vm *vm) boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) { - int i, n; + int i; struct pptdev *ppt; struct vm_memory_segment *seg; - for (n = 0; n < num_pptdevs; n++) { - ppt = &pptdevs[n]; + TAILQ_FOREACH(ppt, &pptdev_list, next) { if (ppt->vm != vm) continue; @@ -377,12 +388,13 @@ ppt_unassign_device(struct vm *vm, int bus, int slot, int func) int ppt_unassign_all(struct vm *vm) { - int i, bus, slot, func; + struct pptdev *ppt; + int bus, slot, func; device_t dev; - for (i = 0; i < num_pptdevs; i++) { - if (pptdevs[i].vm == vm) { - dev = pptdevs[i].dev; + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) { + dev = ppt->dev; bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); diff --git a/sys/amd64/vmm/io/vatpic.c 
b/sys/amd64/vmm/io/vatpic.c index 328c35f..0df6e7c 100644 --- a/sys/amd64/vmm/io/vatpic.c +++ b/sys/amd64/vmm/io/vatpic.c @@ -275,6 +275,7 @@ vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) atpic->ready = false; atpic->icw_num = 1; + atpic->request = 0; atpic->mask = 0; atpic->lowprio = 7; atpic->rd_cmd_reg = 0; diff --git a/sys/amd64/vmm/io/vrtc.c b/sys/amd64/vmm/io/vrtc.c index d5e93dc..9d406c1 100644 --- a/sys/amd64/vmm/io/vrtc.c +++ b/sys/amd64/vmm/io/vrtc.c @@ -63,9 +63,12 @@ struct rtcdev { uint8_t reg_b; uint8_t reg_c; uint8_t reg_d; - uint8_t nvram[128 - 14]; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; } __packed; CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); struct vrtc { struct vm *vm; @@ -214,9 +217,27 @@ secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) rtc->sec = rtcset(rtc, ct.sec); rtc->min = rtcset(rtc, ct.min); - hour = ct.hour; - if ((rtc->reg_b & RTCSB_24HR) == 0) - hour = (hour % 12) + 1; /* convert to a 12-hour format */ + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } rtc->hour = rtcset(rtc, hour); @@ -227,6 +248,7 @@ secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) rtc->day_of_month = rtcset(rtc, ct.day); rtc->month = rtcset(rtc, ct.mon); rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); } static int @@ -256,7 +278,7 @@ rtc_to_secs(struct vrtc *vrtc) struct timespec ts; struct rtcdev *rtc; struct vm *vm; - int error, hour, pm, year; + int century, error, hour, pm, year; KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); @@ -287,9 +309,26 @@ rtc_to_secs(struct vrtc *vrtc) } error = rtcget(rtc, hour, &ct.hour); if ((rtc->reg_b & RTCSB_24HR) == 0) { - ct.hour -= 1; - if (pm) - ct.hour += 12; + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); + goto fail; + } } if (error || ct.hour < 0 || ct.hour > 23) { @@ -323,10 +362,14 @@ rtc_to_secs(struct vrtc *vrtc) VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); goto fail; } - if (year >= 70) - ct.year = 1900 + year; - else - ct.year = 2000 + year; + + error = rtcget(rtc, rtc->century, &century); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } error = clock_ct_to_ts(&ct, &ts); if (error || ts.tv_sec < 0) { @@ -338,7 +381,12 @@ rtc_to_secs(struct vrtc *vrtc) } return (ts.tv_sec); /* success */ fail: - return (VRTC_BROKEN_TIME); /* 
failure */ + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); } static int @@ -593,13 +641,6 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) if ((newval & RTCSB_HALT) == 0) { rtctime = rtc_to_secs(vrtc); if (rtctime == VRTC_BROKEN_TIME) { - /* - * Stop updating the RTC if the date/time - * programmed by the guest is not correct. - */ - VM_CTR0(vrtc->vm, "Invalid RTC date/time " - "programming detected"); - if (rtc_flag_broken_time) return (-1); } @@ -742,7 +783,7 @@ vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) * Don't allow writes to RTC control registers or the date/time fields. */ if (offset < offsetof(struct rtcdev, nvram[0]) || - offset >= sizeof(struct rtcdev)) { + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", offset); return (EINVAL); @@ -776,7 +817,7 @@ vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) /* * Update RTC date/time fields if necessary. */ - if (offset < 10) { + if (offset < 10 || offset == RTC_CENTURY) { curtime = vrtc_curtime(vrtc); secs_to_rtc(curtime, vrtc, 0); } @@ -837,13 +878,17 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, curtime = vrtc_curtime(vrtc); vrtc_time_update(vrtc, curtime); - if (in) { - /* - * Update RTC date/time fields if necessary. - */ - if (offset < 10) - secs_to_rtc(curtime, vrtc, 0); + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + if (in) { if (offset == 12) { /* * XXX @@ -887,6 +932,18 @@ vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, *((uint8_t *)rtc + offset) = *val; break; } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } } VRTC_UNLOCK(vrtc); return (error); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 7f90c61..0e78272 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -218,6 +218,11 @@ SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, &trace_guest_exceptions, 0, "Trap into hypervisor on all guest exceptions and reflect them back"); +static int vmm_force_iommu = 0; +TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu); +SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0, + "Force use of I/O MMU even if no passthrough devices were found."); + static void vcpu_cleanup(struct vm *vm, int i, bool destroy) { @@ -322,7 +327,7 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: vmmdev_init(); - if (ppt_avail_devices() > 0) + if (vmm_force_iommu || ppt_avail_devices() > 0) iommu_init(); error = vmm_init(); if (error == 0) @@ -1248,7 +1253,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; - uint64_t gla, gpa; + uint64_t gla, gpa, cs_base; struct vm_guest_paging *paging; mem_region_read_t mread; mem_region_write_t mwrite; @@ -1260,6 +1265,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; + cs_base = 
vme->u.inst_emul.cs_base; cs_d = vme->u.inst_emul.cs_d; vie = &vme->u.inst_emul.vie; paging = &vme->u.inst_emul.paging; @@ -1274,8 +1280,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) * maximum size instruction. */ length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; - error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip, - length, vie); + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, length, vie); } else { /* * The instruction bytes have already been copied into 'vie' @@ -2328,7 +2334,7 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); - error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); if (error) return (error); off = gpa & PAGE_MASK; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index 0293d191..5be99cb 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -440,10 +440,10 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); gg = (struct vm_gla2gpa *)data; - error = vmm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, + error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, gg->prot, &gg->gpa); KASSERT(error == 0 || error == 1 || error == -1, - ("%s: vmm_gla2gpa unknown error %d", __func__, error)); + ("%s: vm_gla2gpa unknown error %d", __func__, error)); if (error >= 0) { /* * error = 0: the translation was successful diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c index 3db890e..6f75515 100644 --- a/sys/amd64/vmm/vmm_instruction_emul.c +++ b/sys/amd64/vmm/vmm_instruction_emul.c @@ -71,6 +71,8 @@ enum { VIE_OP_TYPE_CMP, VIE_OP_TYPE_POP, VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, VIE_OP_TYPE_LAST }; @@ -145,6 +147,16 @@ 
static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_MOVS, .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, [0xC6] = { /* XXX Group 11 extended opcode - not just MOV */ .op_byte = 0xC6, @@ -161,15 +173,15 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_AND, }, [0x81] = { - /* XXX Group 1 extended opcode - not just AND */ + /* XXX Group 1 extended opcode */ .op_byte = 0x81, - .op_type = VIE_OP_TYPE_AND, + .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { - /* XXX Group 1 extended opcode - not just OR */ + /* XXX Group 1 extended opcode */ .op_byte = 0x83, - .op_type = VIE_OP_TYPE_OR, + .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, }, [0x8F] = { @@ -634,7 +646,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, #else struct iovec copyinfo[2]; #endif - uint64_t dstaddr, srcaddr, val; + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; int error, opsize, seg, repeat; @@ -669,7 +681,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * (1) memory memory n/a * (2) memory mmio emulated * (3) mmio memory emulated - * (4) mmio mmio not emulated + * (4) mmio mmio emulated * * At this point we don't have sufficient information to distinguish * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this @@ -694,7 +706,8 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_copyin(vm, vcpuid, copyinfo, &val, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); error = memwrite(vm, vcpuid, gpa, val, opsize, arg); - goto done; + if (error) + goto done; } else if (error > 0) { /* * Resume guest execution to handle fault. 
@@ -705,37 +718,55 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * 'vm_copy_setup()' is expected to fail for cases (3) and (4) * if 'srcaddr' is in the mmio space. */ - } - error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); - if (error) - goto done; - - error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, - PROT_WRITE, copyinfo, nitems(copyinfo)); - if (error == 0) { - /* - * case (3): read from MMIO and write to system memory. - * - * A MMIO read can have side-effects so we commit to it - * only after vm_copy_setup() is successful. If a page-fault - * needs to be injected into the guest then it will happen - * before the MMIO read is attempted. - */ - error = memread(vm, vcpuid, gpa, &val, opsize, arg); + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); if (error) goto done; - vm_copyout(vm, vcpuid, &val, copyinfo, opsize); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; - } else { - goto done; + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo)); + if (error == 0) { + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else if (error > 0) { + /* + * Resume guest execution to handle fault. + */ + goto done; + } else { + /* + * Case (4): read from and write to mmio. 
+ */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa); + if (error) + goto done; + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa); + if (error) + goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } } error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); @@ -783,6 +814,68 @@ done: } static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. 
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. + */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { @@ -820,16 +913,18 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = vie_update_register(vm, vcpuid, reg, result, size); break; case 0x81: + case 0x83: /* - * AND/OR mem (ModRM:r/m) with immediate and store the + * AND mem (ModRM:r/m) with immediate and store the * result in mem. 
* - * AND: i = 4 - * OR: i = 1 - * 81 /i op r/m16, imm16 - * 81 /i op r/m32, imm32 - * REX.W + 81 /i op r/m64, imm32 sign-extended to 64 + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 */ /* get the first operand */ @@ -838,26 +933,11 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, break; /* - * perform the operation with the pre-fetched immediate - * operand and write the result - */ - switch (vie->reg & 7) { - case 0x4: - /* modrm:reg == b100, AND */ - result = val1 & vie->immediate; - break; - case 0x1: - /* modrm:reg == b001, OR */ - result = val1 | vie->immediate; - break; - default: - error = EINVAL; - break; - } - if (error) - break; - - error = memwrite(vm, vcpuid, gpa, result, size, arg); + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); break; default: break; @@ -894,20 +974,20 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = EINVAL; switch (vie->op.op_byte) { + case 0x81: case 0x83: /* * OR mem (ModRM:r/m) with immediate and store the * result in mem. * - * 83 /1 OR r/m16, imm8 sign-extended to 16 - * 83 /1 OR r/m32, imm8 sign-extended to 32 - * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64 + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 * - * Currently, only the OR operation of the 0x83 opcode - * is implemented (ModRM:reg = b001). 
+ * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 */ - if ((vie->reg & 7) != 1) - break; /* get the first operand */ error = memread(vm, vcpuid, gpa, &val1, size, arg); @@ -978,11 +1058,37 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if (error) return (error); + rflags2 = getcc(size, op1, op2); + break; + case 0x81: + case 0x83: + /* + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. + * + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); break; default: return (EINVAL); } - rflags2 = getcc(size, op1, op2); error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1201,6 +1307,34 @@ emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, return (error); } +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + int 
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, @@ -1212,6 +1346,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (EINVAL); switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; case VIE_OP_TYPE_POP: error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); @@ -1237,6 +1375,10 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, memwrite, memarg); break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; case VIE_OP_TYPE_AND: error = emulate_and(vm, vcpuid, gpa, vie, memread, memwrite, memarg); @@ -1465,7 +1607,7 @@ ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) } int -vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa) { int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; @@ -1825,12 +1967,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) { uint8_t x; - if (cpu_mode == CPU_MODE_REAL) - return (-1); - if (vie->op.op_flags & VIE_OP_F_NO_MODRM) return (0); + if (cpu_mode == CPU_MODE_REAL) + return (-1); + if (vie_peek(vie, &x)) return (-1); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index fa9832e..f06948b 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -58,7 +58,11 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) if (cpu < 0 || cpu >= VM_MAXCPU) return (EINVAL); - if (vector < 32 || vector > 255) + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local 
APIC. + */ + if (vector < 16 || vector > 255) return (EINVAL); vlapic = vm_lapic(vm, cpu); diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index fa058ed..bb6806c 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -2116,6 +2116,16 @@ device_probe_child(device_t dev, device_t child) } /* + * Probes that return BUS_PROBE_NOWILDCARD or lower + * only match on devices whose driver was explicitly + * specified. + */ + if (result <= BUS_PROBE_NOWILDCARD && + !(child->flags & DF_FIXEDCLASS)) { + result = ENXIO; + } + + /* * The driver returned an error so it * certainly doesn't match. */ @@ -2130,14 +2140,6 @@ device_probe_child(device_t dev, device_t child) * of pri for the first match. */ if (best == NULL || result > pri) { - /* - * Probes that return BUS_PROBE_NOWILDCARD - * or lower only match on devices whose - * driver was explicitly specified. - */ - if (result <= BUS_PROBE_NOWILDCARD && - !(child->flags & DF_FIXEDCLASS)) - continue; best = dl; pri = result; continue; |