diff options
Diffstat (limited to 'sys/amd64/vmm')
-rw-r--r-- | sys/amd64/vmm/intel/vmx.c | 27 | ||||
-rw-r--r-- | sys/amd64/vmm/io/ppt.c | 24 | ||||
-rw-r--r-- | sys/amd64/vmm/io/ppt.h | 4 | ||||
-rw-r--r-- | sys/amd64/vmm/io/vhpet.c | 25 | ||||
-rw-r--r-- | sys/amd64/vmm/io/vioapic.c | 58 | ||||
-rw-r--r-- | sys/amd64/vmm/io/vlapic.c | 812 | ||||
-rw-r--r-- | sys/amd64/vmm/io/vlapic.h | 14 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm.c | 77 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_dev.c | 16 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_lapic.c | 84 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_lapic.h | 16 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_msr.c | 8 | ||||
-rw-r--r-- | sys/amd64/vmm/vmm_msr.h | 5 |
13 files changed, 861 insertions, 309 deletions
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 9071f3e..10e83ea 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1359,7 +1359,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) struct vmcs *vmcs; struct vmxctx *vmxctx; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason; - uint64_t qual, gpa; + uint64_t qual, gpa, rflags; + bool retu; handled = 0; vmcs = &vmx->vmcs[vcpu]; @@ -1405,31 +1406,46 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); + retu = false; ecx = vmxctx->guest_rcx; - error = emulate_rdmsr(vmx->vm, vcpu, ecx); + error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; - } else + } else if (!retu) { handled = 1; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); + retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; error = emulate_wrmsr(vmx->vm, vcpu, ecx, - (uint64_t)edx << 32 | eax); + (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; - } else + } else if (!retu) { handled = 1; + } else { + /* Return to userspace with a valid exitcode */ + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + if ((error = vmread(VMCS_GUEST_RFLAGS, &rflags)) != 0) + panic("vmx_exit_process: vmread(rflags) %d", error); vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = rflags; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); @@ -1584,7 +1600,6 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap) panic("vmx_run: error %d setting up pcpu defaults", error); do { - lapic_timer_tick(vmx->vm, vcpu); vmx_inject_interrupts(vmx, vcpu); vmx_run_trace(vmx, vcpu); rc = vmx_setjmp(vmxctx); diff --git a/sys/amd64/vmm/io/ppt.c b/sys/amd64/vmm/io/ppt.c index fce4bbd..32d59a0 100644 --- a/sys/amd64/vmm/io/ppt.c +++ b/sys/amd64/vmm/io/ppt.c @@ -72,8 +72,8 @@ MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); struct pptintr_arg { /* pptintr(pptintr_arg) */ struct pptdev *pptdev; - int vec; - int vcpu; + uint64_t addr; + uint64_t msg_data; }; static struct pptdev { @@ -412,16 +412,14 @@ ppt_map_mmio(struct vm *vm, int bus, int slot, int func, static int pptintr(void *arg) { - int vec; struct pptdev *ppt; struct pptintr_arg *pptarg; pptarg = arg; ppt = pptarg->pptdev; - vec = pptarg->vec; if (ppt->vm != NULL) - lapic_intr_edge(ppt->vm, pptarg->vcpu, vec); + lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); else { /* * XXX @@ -441,15 +439,13 @@ pptintr(void *arg) int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, - int destcpu, int vector, int numvec) + uint64_t addr, uint64_t msg, int numvec) { int i, rid, flags; int msi_count, startrid, error, tmp; struct pptdev *ppt; - if ((destcpu >= VM_MAXCPU || destcpu < 0) || - (vector < 0 || vector > 255) || - (numvec < 0 || numvec > MAX_MSIMSGS)) + if (numvec < 0 || numvec > MAX_MSIMSGS) return (EINVAL); ppt = ppt_find(bus, slot, func); @@ -513,8 +509,8 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, break; ppt->msi.arg[i].pptdev = ppt; - ppt->msi.arg[i].vec = vector + i; - ppt->msi.arg[i].vcpu = destcpu; + ppt->msi.arg[i].addr = addr; + ppt->msi.arg[i].msg_data = msg + i; error = bus_setup_intr(ppt->dev, ppt->msi.res[i], INTR_TYPE_NET | INTR_MPSAFE, @@ -534,7 +530,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, - int idx, uint32_t msg, uint32_t vector_control, uint64_t addr) + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) { struct pptdev *ppt; struct pci_devinfo *dinfo; @@ -605,8 +601,8 @@ ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, return (ENXIO); ppt->msix.arg[idx].pptdev = ppt; - ppt->msix.arg[idx].vec = msg & 0xFF; - ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF; + ppt->msix.arg[idx].addr = addr; + ppt->msix.arg[idx].msg_data = msg; /* Setup the MSI-X interrupt */ error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], diff --git a/sys/amd64/vmm/io/ppt.h b/sys/amd64/vmm/io/ppt.h index 7670bc4..45ba323 100644 --- a/sys/amd64/vmm/io/ppt.h +++ b/sys/amd64/vmm/io/ppt.h @@ -33,9 +33,9 @@ int ppt_unassign_all(struct vm *vm); int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, - int destcpu, int vector, int numvec); + uint64_t addr, uint64_t msg, int numvec); int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, - int idx, uint32_t msg, uint32_t vector_control, uint64_t addr); + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); int ppt_num_devices(struct vm *vm); boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); diff --git a/sys/amd64/vmm/io/vhpet.c b/sys/amd64/vmm/io/vhpet.c index 112480ee..929b343 100644 --- a/sys/amd64/vmm/io/vhpet.c +++ b/sys/amd64/vmm/io/vhpet.c @@ -240,8 +240,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { - int apicid, vector, vcpuid, pin; - cpuset_t dmask; + int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) @@ -256,26 +255,8 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n) } if (vhpet_timer_msi_enabled(vhpet, n)) { - /* - * XXX should have an API 'vlapic_deliver_msi(vm, addr, data)' - * - assuming physical delivery mode - * - no need to interpret contents of 'msireg' here - */ - vector = vhpet->timer[n].msireg & 0xff; - apicid = (vhpet->timer[n].msireg >> (32 + 12)) & 0xff; - if (apicid != 0xff) { - /* unicast */ - vcpuid = vm_apicid2vcpuid(vhpet->vm, apicid); - lapic_intr_edge(vhpet->vm, vcpuid, vector); - } else { - /* broadcast */ - dmask = vm_active_cpus(vhpet->vm); - while ((vcpuid = CPU_FFS(&dmask)) != 0) { - vcpuid--; - CPU_CLR(vcpuid, &dmask); - lapic_intr_edge(vhpet->vm, vcpuid, vector); - } - } + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); return; } diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index 167e8ab..151065a 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include "vmm_ktr.h" #include "vmm_lapic.h" +#include "vlapic.h" #include "vioapic.h" #define IOREGSEL 0x00 @@ -91,25 +92,14 @@ pinstate_str(bool asserted) else return ("deasserted"); } - -static const char * -trigger_str(bool level) -{ - - if (level) - return ("level"); - else - return ("edge"); -} #endif static void vioapic_send_intr(struct vioapic *vioapic, int pin) { - int vector, apicid, vcpuid; - uint32_t low, high; - cpuset_t dmask; - bool level; + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; KASSERT(pin >= 0 && pin < REDIR_ENTRIES, ("vioapic_set_pinstate: invalid pin number %d", pin)); @@ -120,52 +110,20 @@ vioapic_send_intr(struct vioapic *vioapic, int pin) low = vioapic->rtbl[pin].reg; high = vioapic->rtbl[pin].reg >> 32; - /* - * XXX We only deal with: - * - physical destination - * - fixed delivery mode - */ - if ((low & IOART_DESTMOD) != IOART_DESTPHY) { - VIOAPIC_CTR2(vioapic, "ioapic pin%d: unsupported dest mode " - "0x%08x", pin, low); - return; - } - - if ((low & IOART_DELMOD) != IOART_DELFIXED) { - VIOAPIC_CTR2(vioapic, "ioapic pin%d: unsupported delivery mode " - "0x%08x", pin, low); - return; - } - if ((low & IOART_INTMASK) == IOART_INTMSET) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); return; } + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; level = low & IOART_TRGRLVL ? true : false; if (level) vioapic->rtbl[pin].reg |= IOART_REM_IRR; vector = low & IOART_INTVEC; - apicid = high >> APIC_ID_SHIFT; - if (apicid != 0xff) { - /* unicast */ - vcpuid = vm_apicid2vcpuid(vioapic->vm, apicid); - VIOAPIC_CTR4(vioapic, "ioapic pin%d: %s triggered intr " - "vector %d on vcpuid %d", pin, trigger_str(level), - vector, vcpuid); - lapic_set_intr(vioapic->vm, vcpuid, vector, level); - } else { - /* broadcast */ - VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s triggered intr " - "vector %d on all vcpus", pin, trigger_str(level), vector); - dmask = vm_active_cpus(vioapic->vm); - while ((vcpuid = CPU_FFS(&dmask)) != 0) { - vcpuid--; - CPU_CLR(vcpuid, &dmask); - lapic_set_intr(vioapic->vm, vcpuid, vector, level); - } - } + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); } static void diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 6e5b5ea..695040d 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -30,8 +30,10 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/lock.h> #include <sys/kernel.h> #include <sys/malloc.h> +#include <sys/mutex.h> #include <sys/systm.h> #include <sys/smp.h> @@ -53,6 +55,9 @@ __FBSDID("$FreeBSD$"); #define VLAPIC_CTR1(vlapic, format, p1) \ VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + #define VLAPIC_CTR_IRR(vlapic, msg) \ do { \ uint32_t *irrptr = &(vlapic)->apic.irr0; \ @@ -86,7 +91,7 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (16) -#define VLAPIC_MAXLVT_ENTRIES (5) +#define VLAPIC_MAXLVT_ENTRIES (APIC_LVT_CMCI) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) @@ -100,12 +105,16 @@ struct vlapic { struct vm *vm; int vcpuid; - struct LAPIC apic; + struct LAPIC apic; - int esr_update; + uint32_t esr_pending; + int esr_firing; - int divisor; - int ccr_ticks; + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; /* * The 'isrvec_stk' is a stack of vectors injected by the local apic. @@ -120,8 +129,101 @@ struct vlapic { enum boot_state boot_state; }; +/* + * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the + * vlapic_callout_handler() and vcpu accesses to the following registers: + * - initial count register aka icr_timer + * - current count register aka ccr_timer + * - divide config register aka dcr_timer + * - timer LVT register + * + * Note that the vlapic_callout_handler() does not write to any of these + * registers so they can be safely read from the vcpu context without locking. + */ +#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) +#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) + #define VLAPIC_BUS_FREQ tsc_freq +static __inline uint32_t +vlapic_get_id(struct vlapic *vlapic) +{ + + if (x2apic(vlapic)) + return (vlapic->vcpuid); + else + return (vlapic->vcpuid << 24); +} + +static __inline uint32_t +vlapic_get_ldr(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + int apicid; + uint32_t ldr; + + lapic = &vlapic->apic; + if (x2apic(vlapic)) { + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); + } else + return (lapic->ldr); +} + +static __inline uint32_t +vlapic_get_dfr(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + lapic = &vlapic->apic; + if (x2apic(vlapic)) + return (0); + else + return (lapic->dfr); +} + +static void +vlapic_set_dfr(struct vlapic *vlapic, uint32_t data) +{ + uint32_t dfr; + struct LAPIC *lapic; + + if (x2apic(vlapic)) { + VM_CTR1(vlapic->vm, "write to DFR in x2apic mode: %#x", data); + return; + } + + lapic = &vlapic->apic; + dfr = (lapic->dfr & APIC_DFR_RESERVED) | (data & APIC_DFR_MODEL_MASK); + if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) + VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); + else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) + VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); + else + VLAPIC_CTR1(vlapic, "vlapic DFR in Unknown Model %#x", dfr); + + lapic->dfr = dfr; +} + +static void +vlapic_set_ldr(struct vlapic *vlapic, uint32_t data) +{ + struct LAPIC *lapic; + + /* LDR is read-only in x2apic mode */ + if (x2apic(vlapic)) { + VLAPIC_CTR1(vlapic, "write to LDR in x2apic mode: %#x", data); + return; + } + + lapic = &vlapic->apic; + lapic->ldr = data & ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); +} + static int vlapic_timer_divisor(uint32_t dcr) { @@ -167,48 +269,92 @@ vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) } #endif -static uint64_t +static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { - struct LAPIC *lapic = &vlapic->apic; - return lapic->ccr_timer; + struct bintime bt_now, bt_rem; + struct LAPIC *lapic; + uint32_t ccr; + + ccr = 0; + lapic = &vlapic->apic; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_active(&vlapic->callout)) { + /* + * If the timer is scheduled to expire in the future then + * compute the value of 'ccr' based on the remaining time. + */ + binuptime(&bt_now); + if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { + bt_rem = vlapic->timer_fire_bt; + bintime_sub(&bt_rem, &bt_now); + ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); + ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; + } + } + KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " + "icr_timer is %#x", ccr, lapic->icr_timer)); + VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", + ccr, lapic->icr_timer); + VLAPIC_TIMER_UNLOCK(vlapic); + return (ccr); } static void -vlapic_update_errors(struct vlapic *vlapic) +vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr) { - struct LAPIC *lapic = &vlapic->apic; - lapic->esr = 0; // XXX + struct LAPIC *lapic; + int divisor; + + lapic = &vlapic->apic; + VLAPIC_TIMER_LOCK(vlapic); + + lapic->dcr_timer = dcr; + divisor = vlapic_timer_divisor(dcr); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, divisor); + + /* + * Update the timer frequency and the timer period. + * + * XXX changes to the frequency divider will not take effect until + * the timer is reloaded. + */ + FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + + VLAPIC_TIMER_UNLOCK(vlapic); } static void -vlapic_init_ipi(struct vlapic *vlapic) +vlapic_update_errors(struct vlapic *vlapic) { struct LAPIC *lapic = &vlapic->apic; - lapic->version = VLAPIC_VERSION; - lapic->version |= (VLAPIC_MAXLVT_ENTRIES < MAXLVTSHIFT); - lapic->dfr = 0xffffffff; - lapic->svr = APIC_SVR_VECTOR; - vlapic_mask_lvts(&lapic->lvt_timer, VLAPIC_MAXLVT_ENTRIES+1); + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; } -static int +static void vlapic_reset(struct vlapic *vlapic) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic; + + lapic = &vlapic->apic; + bzero(lapic, sizeof(struct LAPIC)); - memset(lapic, 0, sizeof(*lapic)); - lapic->apr = vlapic->vcpuid; - vlapic_init_ipi(vlapic); - vlapic->divisor = vlapic_timer_divisor(lapic->dcr_timer); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(&lapic->lvt_timer, 6); + vlapic_mask_lvts(&lapic->lvt_cmci, 1); + vlapic_set_dcr(vlapic, 0); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ - - return 0; - } void @@ -221,6 +367,17 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) if (vector < 0 || vector >= 256) panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + if (!(lapic->svr & APIC_SVR_ENABLE)) { + VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " + "interrupt %d", vector); + return; + } + + if (vector < 16) { + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + return; + } + idx = (vector / 32) * 4; mask = 1 << (vector % 32); @@ -241,39 +398,93 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); } -static void -vlapic_start_timer(struct vlapic *vlapic, uint32_t elapsed) +static __inline uint32_t * +vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { - uint32_t icr_timer; - - icr_timer = vlapic->apic.icr_timer; + struct LAPIC *lapic = &vlapic->apic; + int i; - vlapic->ccr_ticks = ticks; - if (elapsed < icr_timer) - vlapic->apic.ccr_timer = icr_timer - elapsed; - else { - /* - * This can happen when the guest is trying to run its local - * apic timer higher that the setting of 'hz' in the host. - * - * We deal with this by running the guest local apic timer - * at the rate of the host's 'hz' setting. - */ - vlapic->apic.ccr_timer = 0; + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + return (&lapic->lvt_cmci); + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; + return ((&lapic->lvt_timer) + i);; + default: + panic("vlapic_get_lvt: invalid LVT\n"); } } -static __inline uint32_t * +static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { - struct LAPIC *lapic = &vlapic->apic; - int i; - if (offset < APIC_OFFSET_TIMER_LVT || offset > APIC_OFFSET_ERROR_LVT) { - panic("vlapic_get_lvt: invalid LVT\n"); + return (*vlapic_get_lvtptr(vlapic, offset)); +} + +static void +vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val) +{ + uint32_t *lvtptr, mask; + struct LAPIC *lapic; + + lapic = &vlapic->apic; + lvtptr = vlapic_get_lvtptr(vlapic, offset); + + if (offset == APIC_OFFSET_TIMER_LVT) + VLAPIC_TIMER_LOCK(vlapic); + + if (!(lapic->svr & APIC_SVR_ENABLE)) + val |= APIC_LVT_M; + mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; + switch (offset) { + case APIC_OFFSET_TIMER_LVT: + mask |= APIC_LVTT_TM; + break; + case APIC_OFFSET_ERROR_LVT: + break; + case APIC_OFFSET_LINT0_LVT: + case APIC_OFFSET_LINT1_LVT: + mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; + /* FALLTHROUGH */ + default: + mask |= APIC_LVT_DM; + break; } - i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; - return ((&lapic->lvt_timer) + i);; + *lvtptr = val & mask; + + if (offset == APIC_OFFSET_TIMER_LVT) + VLAPIC_TIMER_UNLOCK(vlapic); +} + +static int +vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +{ + uint32_t vec, mode; + + if (lvt & APIC_LVT_M) + return (0); + + vec = lvt & APIC_LVT_VECTOR; + mode = lvt & APIC_LVT_DM; + + switch (mode) { + case APIC_LVT_DM_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + return (0); + } + vlapic_set_intr_ready(vlapic, vec, false); + vcpu_notify_event(vlapic->vm, vlapic->vcpuid); + break; + case APIC_LVT_DM_NMI: + vm_inject_nmi(vlapic->vm, vlapic->vcpuid); + break; + default: + // Other modes ignored + return (0); + } + return (1); } #if 1 @@ -398,44 +609,314 @@ vlapic_process_eoi(struct vlapic *vlapic) } static __inline int -vlapic_get_lvt_field(uint32_t *lvt, uint32_t mask) +vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { - return (*lvt & mask); + + return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { - uint32_t *lvt; + uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } +static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); + +void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +{ + uint32_t lvt; + + vlapic->esr_pending |= mask; + if (vlapic->esr_firing) + return; + vlapic->esr_firing = 1; + + // The error LVT always uses the fixed delivery mode. + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); + } + vlapic->esr_firing = 0; +} + static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { - int vector; - uint32_t *lvt; + uint32_t lvt; + + KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); + // The timer LVT always uses the fixed delivery mode. lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - - if (!vlapic_get_lvt_field(lvt, APIC_LVTT_M)) { + if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); - vector = vlapic_get_lvt_field(lvt,APIC_LVTT_VECTOR); - vlapic_set_intr_ready(vlapic, vector, false); + } +} + +static VMM_STAT(VLAPIC_INTR_CMC, + "corrected machine check interrupts generated by vlapic"); + +void +vlapic_fire_cmci(struct vlapic *vlapic) +{ + uint32_t lvt; + + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); + } +} + +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_ENTRIES, + "lvts triggered"); + +int +vlapic_trigger_lvt(struct vlapic *vlapic, int vector) +{ + uint32_t lvt; + + switch (vector) { + case APIC_LVT_LINT0: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); + break; + case APIC_LVT_LINT1: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); + break; + case APIC_LVT_TIMER: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); + lvt |= APIC_LVT_DM_FIXED; + break; + case APIC_LVT_ERROR: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); + lvt |= APIC_LVT_DM_FIXED; + break; + case APIC_LVT_PMC: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); + break; + case APIC_LVT_THERMAL: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); + break; + case APIC_LVT_CMCI: + lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + break; + default: + return (EINVAL); + } + if (vlapic_fire_lvt(vlapic, lvt)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } + return (0); +} + +static void +vlapic_callout_handler(void *arg) +{ + struct vlapic *vlapic; + struct bintime bt, btnow; + sbintime_t rem_sbt; + + vlapic = arg; + + VLAPIC_TIMER_LOCK(vlapic); + if (callout_pending(&vlapic->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vlapic->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vlapic->callout); + + KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled")); + + vlapic_fire_timer(vlapic); + + if (vlapic_periodic_timer(vlapic)) { + binuptime(&btnow); + KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), + ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", + btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, + vlapic->timer_fire_bt.frac)); + + /* + * Compute the delta between when the timer was supposed to + * fire and the present time. + */ + bt = btnow; + bintime_sub(&bt, &vlapic->timer_fire_bt); + + rem_sbt = bttosbt(vlapic->timer_period_bt); + if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { + /* + * Adjust the time until the next countdown downward + * to account for the lost time. + */ + rem_sbt -= bttosbt(bt); + } else { + /* + * If the delta is greater than the timer period then + * just reset our time base instead of trying to catch + * up. + */ + vlapic->timer_fire_bt = btnow; + VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " + "usecs, period is %lu usecs - resetting time base", + bttosbt(bt) / SBT_1US, + bttosbt(vlapic->timer_period_bt) / SBT_1US); + } + + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, rem_sbt, 0, + vlapic_callout_handler, vlapic, 0); + } +done: + VLAPIC_TIMER_UNLOCK(vlapic); +} + +static void +vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer) +{ + struct LAPIC *lapic; + sbintime_t sbt; + + VLAPIC_TIMER_LOCK(vlapic); + + lapic = &vlapic->apic; + lapic->icr_timer = icr_timer; + + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, icr_timer); + + if (icr_timer != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); + + sbt = bttosbt(vlapic->timer_period_bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else + callout_stop(&vlapic->callout); + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +/* + * This function populates 'dmask' with the set of vcpus that match the + * addressing specified by the (dest, phys, lowprio) tuple. + * + * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) + * or xAPIC (8-bit) destination field. + */ +static void +vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest) +{ + struct vlapic *vlapic; + uint32_t dfr, ldr, ldest, cluster; + uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; + cpuset_t amask; + int vcpuid; + + if ((x2apic_dest && dest == 0xffffffff) || + (!x2apic_dest && dest == 0xff)) { + /* + * Broadcast in both logical and physical modes. + */ + *dmask = vm_active_cpus(vm); + return; + } + + if (phys) { + /* + * Physical mode: destination is APIC ID. + */ + CPU_ZERO(dmask); + vcpuid = vm_apicid2vcpuid(vm, dest); + if (vcpuid < VM_MAXCPU) + CPU_SET(vcpuid, dmask); + } else { + /* + * In the "Flat Model" the MDA is interpreted as an 8-bit wide + * bitmask. This model is only avilable in the xAPIC mode. + */ + mda_flat_ldest = dest & 0xff; + + /* + * In the "Cluster Model" the MDA is used to identify a + * specific cluster and a set of APICs in that cluster. + */ + if (x2apic_dest) { + mda_cluster_id = dest >> 16; + mda_cluster_ldest = dest & 0xffff; + } else { + mda_cluster_id = (dest >> 4) & 0xf; + mda_cluster_ldest = dest & 0xf; + } + + /* + * Logical mode: match each APIC that has a bit set + * in it's LDR that matches a bit in the ldest. + */ + CPU_ZERO(dmask); + amask = vm_active_cpus(vm); + while ((vcpuid = CPU_FFS(&amask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &amask); + + vlapic = vm_lapic(vm, vcpuid); + dfr = vlapic_get_dfr(vlapic); + ldr = vlapic_get_ldr(vlapic); + + if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_FLAT) { + ldest = ldr >> 24; + mda_ldest = mda_flat_ldest; + } else if ((dfr & APIC_DFR_MODEL_MASK) == + APIC_DFR_MODEL_CLUSTER) { + if (x2apic(vlapic)) { + cluster = ldr >> 16; + ldest = ldr & 0xffff; + } else { + cluster = ldr >> 28; + ldest = (ldr >> 24) & 0xf; + } + if (cluster != mda_cluster_id) + continue; + mda_ldest = mda_cluster_ldest; + } else { + /* + * Guest has configured a bad logical + * model for this vcpu - skip it. + */ + VLAPIC_CTR1(vlapic, "vlapic has bad logical " + "model %x - cannot deliver interrupt", dfr); + continue; + } + + if ((mda_ldest & ldest) != 0) { + CPU_SET(vcpuid, dmask); + if (lowprio) + break; + } + } } } static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); static int -lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) +lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu) { int i; + bool phys; cpuset_t dmask; uint32_t dest, vec, mode; struct vlapic *vlapic2; @@ -448,10 +929,17 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; + if (mode == APIC_DELMODE_FIXED && vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + return (0); + } + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { switch (icrval & APIC_DEST_MASK) { case APIC_DEST_DESTFLD: - CPU_SETOF(dest, &dmask); + phys = ((icrval & APIC_DESTMODE_LOG) == 0); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, + x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); @@ -508,17 +996,18 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval) if (vlapic2->boot_state != BS_SIPI) return (0); - vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINUP_AP; - vmexit->u.spinup_ap.vcpu = dest; - vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; - /* * XXX this assumes that the startup IPI always succeeds */ vlapic2->boot_state = BS_RUNNING; vm_activate_cpu(vlapic2->vm, dest); + *retu = true; + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = dest; + vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + return (0); } } @@ -555,7 +1044,6 @@ vlapic_pending_intr(struct vlapic *vlapic) break; } } - VLAPIC_CTR0(vlapic, "no pending intr"); return (-1); } @@ -593,8 +1081,39 @@ vlapic_intr_accepted(struct vlapic *vlapic, int vector) vlapic_update_ppr(vlapic); } +static void +lapic_set_svr(struct vlapic *vlapic, uint32_t new) +{ + struct LAPIC *lapic; + uint32_t old, changed; + + lapic = &vlapic->apic; + old = lapic->svr; + changed = old ^ new; + if ((changed & APIC_SVR_ENABLE) != 0) { + if ((new & APIC_SVR_ENABLE) == 0) { + /* + * The apic is now disabled so stop the apic timer. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + VLAPIC_TIMER_UNLOCK(vlapic); + } else { + /* + * The apic is now enabled so restart the apic timer + * if it is configured in periodic mode. + */ + VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); + if (vlapic_periodic_timer(vlapic)) + vlapic_set_icr_timer(vlapic, lapic->icr_timer); + } + } + lapic->svr = new; +} + int -vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data) +vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = &vlapic->apic; uint32_t *reg; @@ -602,17 +1121,14 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data) if (offset > sizeof(*lapic)) { *data = 0; - return 0; + goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: - if (x2apic(vlapic)) - *data = vlapic->vcpuid; - else - *data = vlapic->vcpuid << 24; + *data = vlapic_get_id(vlapic); break; case APIC_OFFSET_VER: *data = lapic->version; @@ -630,10 +1146,10 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data) *data = lapic->eoi; break; case APIC_OFFSET_LDR: - *data = lapic->ldr; + *data = vlapic_get_ldr(vlapic); break; case APIC_OFFSET_DFR: - *data = lapic->dfr; + *data = vlapic_get_dfr(vlapic); break; case APIC_OFFSET_SVR: *data = lapic->svr; @@ -662,9 +1178,9 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data) case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; + case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: - reg = vlapic_get_lvt(vlapic, offset); - *data = *(reg); + *data = vlapic_get_lvt(vlapic, offset); break; case APIC_OFFSET_ICR: *data = lapic->icr_timer; @@ -680,16 +1196,19 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data) *data = 0; break; } +done: + VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int -vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data) +vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = &vlapic->apic; - uint32_t *reg; int retval; + VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data); + if (offset > sizeof(*lapic)) { return 0; } @@ -708,18 +1227,20 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data) vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: + vlapic_set_ldr(vlapic, data); break; case APIC_OFFSET_DFR: + vlapic_set_dfr(vlapic, data); break; case APIC_OFFSET_SVR: - lapic->svr = data; + lapic_set_svr(vlapic, data); break; case APIC_OFFSET_ICR_LOW: if (!x2apic(vlapic)) { data &= 0xffffffff; data |= (uint64_t)lapic->icr_hi << 32; } - retval = lapic_process_icr(vlapic, data); + retval = lapic_process_icr(vlapic, data, retu); break; case APIC_OFFSET_ICR_HI: if (!x2apic(vlapic)) { @@ -727,22 +1248,16 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data) lapic->icr_hi = data; } break; + case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: - reg = vlapic_get_lvt(vlapic, offset); - if (!(lapic->svr & APIC_SVR_ENABLE)) { - data |= APIC_LVT_M; - } - *reg = data; - // vlapic_dump_lvt(offset, reg); + vlapic_set_lvt(vlapic, offset, data); break; case APIC_OFFSET_ICR: - lapic->icr_timer = data; - vlapic_start_timer(vlapic, 0); + vlapic_set_icr_timer(vlapic, data); break; case APIC_OFFSET_DCR: - lapic->dcr_timer = data; - vlapic->divisor = vlapic_timer_divisor(data); + vlapic_set_dcr(vlapic, data); break; case APIC_OFFSET_ESR: @@ -764,70 +1279,6 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data) return (retval); } -int -vlapic_timer_tick(struct vlapic *vlapic) -{ - int curticks, delta, periodic, fired; - uint32_t ccr; - uint32_t decrement, leftover; - -restart: - curticks = ticks; - delta = curticks - vlapic->ccr_ticks; - - /* Local APIC timer is disabled */ - if (vlapic->apic.icr_timer == 0) - return (-1); - - /* One-shot mode and timer has already counted down to zero */ - periodic = vlapic_periodic_timer(vlapic); - if (!periodic && vlapic->apic.ccr_timer == 0) - return (-1); - /* - * The 'curticks' and 'ccr_ticks' are out of sync by more than - * 2^31 ticks. We deal with this by restarting the timer. - */ - if (delta < 0) { - vlapic_start_timer(vlapic, 0); - goto restart; - } - - fired = 0; - decrement = (VLAPIC_BUS_FREQ / vlapic->divisor) / hz; - - vlapic->ccr_ticks = curticks; - ccr = vlapic->apic.ccr_timer; - - while (delta-- > 0) { - if (ccr > decrement) { - ccr -= decrement; - continue; - } - - /* Trigger the local apic timer interrupt */ - vlapic_fire_timer(vlapic); - if (periodic) { - leftover = decrement - ccr; - vlapic_start_timer(vlapic, leftover); - ccr = vlapic->apic.ccr_timer; - } else { - /* - * One-shot timer has counted down to zero. - */ - ccr = 0; - } - fired = 1; - break; - } - - vlapic->apic.ccr_timer = ccr; - - if (!fired) - return ((ccr / decrement) + 1); - else - return (0); -} - struct vlapic * vlapic_init(struct vm *vm, int vcpuid) { @@ -837,6 +1288,16 @@ vlapic_init(struct vm *vm, int vcpuid) vlapic->vm = vm; vlapic->vcpuid = vcpuid; + /* + * If the vlapic is configured in x2apic mode then it will be + * accessed in the critical section via the MSR emulation code. + * + * Therefore the timer mutex must be a spinlock because blockable + * mutexes cannot be acquired in a critical section. + */ + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); + callout_init(&vlapic->callout, 1); + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vcpuid == 0) @@ -851,6 +1312,7 @@ void vlapic_cleanup(struct vlapic *vlapic) { + callout_drain(&vlapic->callout); free(vlapic, M_VLAPIC); } @@ -887,3 +1349,43 @@ vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; } + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. + */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &dmask); + lapic_set_intr(vm, vcpuid, vec, level); + } +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = &vlapic->apic; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index 8ea65ee..98f377e 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -69,6 +69,7 @@ struct vm; #define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R #define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R #define APIC_OFFSET_ESR 0x280 // Error Status Register R +#define APIC_OFFSET_CMCI_LVT 0x2F0 // Local Vector Table (CMCI) R/W #define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W #define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W #define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W @@ -90,15 +91,22 @@ enum x2apic_state; struct vlapic *vlapic_init(struct vm *vm, int vcpuid); void vlapic_cleanup(struct vlapic *vlapic); -int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data); -int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data); +int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, + bool *retu); +int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, + bool *retu); int vlapic_pending_intr(struct vlapic *vlapic); void vlapic_intr_accepted(struct vlapic *vlapic, int vector); void vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); -int vlapic_timer_tick(struct vlapic *vlapic); +void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); uint64_t vlapic_get_apicbase(struct vlapic *vlapic); void vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index 8cbd679..f471218b 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vm.h> #include <machine/pcb.h> #include <machine/smp.h> +#include <x86/psl.h> #include <x86/apicreg.h> #include <machine/vmparam.h> @@ -869,41 +870,44 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ static int -vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu) +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { + struct vm_exit *vmexit; struct vcpu *vcpu; - int sleepticks, t; + int t, timo; vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); /* - * Figure out the number of host ticks until the next apic - * timer interrupt in the guest. - */ - sleepticks = lapic_timer_tick(vm, vcpuid); - - /* - * If the guest local apic timer is disabled then sleep for - * a long time but not forever. - */ - if (sleepticks < 0) - sleepticks = hz; - - /* * Do a final check for pending NMI or interrupts before * really putting this thread to sleep. * * These interrupts could have happened any time after we * returned from VMRUN() and before we grabbed the vcpu lock. */ - if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) { - if (sleepticks <= 0) - panic("invalid sleepticks %d", sleepticks); + if (!vm_nmi_pending(vm, vcpuid) && + (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) { t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); - msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks); + if (vlapic_enabled(vcpu->vlapic)) { + /* + * XXX msleep_spin() is not interruptible so use the + * 'timo' to put an upper bound on the sleep time. + */ + timo = hz; + msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); + } else { + /* + * Spindown the vcpu if the apic is disabled and it + * had entered the halted state. + */ + *retu = true; + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; + VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + } vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } @@ -913,7 +917,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu) } static int -vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu) +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) { int rv, ftype; struct vm_map *map; @@ -951,7 +955,7 @@ done: } static int -vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu) +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; @@ -992,15 +996,12 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu) mread = vhpet_mmio_read; mwrite = vhpet_mmio_write; } else { - *retu = TRUE; + *retu = true; return (0); } - error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, 0); - - /* return to userland to spin up the AP */ - if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP) - *retu = TRUE; + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, + retu); return (error); } @@ -1013,7 +1014,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun) struct pcb *pcb; uint64_t tscval, rip; struct vm_exit *vme; - boolean_t retu; + bool retu, intr_disabled; pmap_t pmap; vcpuid = vmrun->cpuid; @@ -1053,10 +1054,11 @@ restart: critical_exit(); if (error == 0) { - retu = FALSE; + retu = false; switch (vme->exitcode) { case VM_EXITCODE_HLT: - error = vm_handle_hlt(vm, vcpuid, &retu); + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); @@ -1065,12 +1067,12 @@ restart: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; default: - retu = TRUE; /* handled in userland */ + retu = true; /* handled in userland */ break; } } - if (error == 0 && retu == FALSE) { + if (error == 0 && retu == false) { rip = vme->rip + vme->inst_length; goto restart; } @@ -1109,7 +1111,7 @@ vm_inject_nmi(struct vm *vm, int vcpuid) vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; - vm_interrupt_hostcpu(vm, vcpuid); + vcpu_notify_event(vm, vcpuid); return (0); } @@ -1329,8 +1331,15 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) return (0); } +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. + */ void -vm_interrupt_hostcpu(struct vm *vm, int vcpuid) +vcpu_notify_event(struct vm *vm, int vcpuid) { int hostcpu; struct vcpu *vcpu; diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c index f248f68..02847c2 100644 --- a/sys/amd64/vmm/vmm_dev.c +++ b/sys/amd64/vmm/vmm_dev.c @@ -152,6 +152,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct vm_run *vmrun; struct vm_event *vmevent; struct vm_lapic_irq *vmirq; + struct vm_lapic_msi *vmmsi; struct vm_ioapic_irq *ioapic_irq; struct vm_capability *vmcap; struct vm_pptdev *pptdev; @@ -254,7 +255,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, pptmsi = (struct vm_pptdev_msi *)data; error = ppt_setup_msi(sc->vm, pptmsi->vcpu, pptmsi->bus, pptmsi->slot, pptmsi->func, - pptmsi->destcpu, pptmsi->vector, + pptmsi->addr, pptmsi->msg, pptmsi->numvec); break; case VM_PPTDEV_MSIX: @@ -262,8 +263,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, error = ppt_setup_msix(sc->vm, pptmsix->vcpu, pptmsix->bus, pptmsix->slot, pptmsix->func, pptmsix->idx, - pptmsix->msg, pptmsix->vector_control, - pptmsix->addr); + pptmsix->addr, pptmsix->msg, + pptmsix->vector_control); break; case VM_MAP_PPTDEV_MMIO: pptmmio = (struct vm_pptdev_mmio *)data; @@ -296,6 +297,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, vmirq = (struct vm_lapic_irq *)data; error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); break; + case VM_LAPIC_LOCAL_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_local_intr(sc->vm, vmirq->cpuid, + vmirq->vector); + break; + case VM_LAPIC_MSI: + vmmsi = (struct vm_lapic_msi *)data; + error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); + break; case VM_IOAPIC_ASSERT_IRQ: ioapic_irq = (struct vm_ioapic_irq *)data; error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 465ce6c..8d915cd 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -38,9 +38,18 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include "vmm_ipi.h" +#include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + int lapic_pending_intr(struct vm *vm, int cpu) { @@ -75,19 +84,74 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) vlapic = vm_lapic(vm, cpu); vlapic_set_intr_ready(vlapic, vector, level); - vm_interrupt_hostcpu(vm, cpu); + vcpu_notify_event(vm, cpu); return (0); } int -lapic_timer_tick(struct vm *vm, int cpu) +lapic_set_local_intr(struct vm *vm, int cpu, int vector) { struct vlapic *vlapic; + cpuset_t dmask; + int error; - vlapic = vm_lapic(vm, cpu); + if (cpu < -1 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(cpu, &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(cpu, &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} - return (vlapic_timer_tick(vlapic)); +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? "physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); } static boolean_t @@ -117,7 +181,7 @@ lapic_msr(u_int msr) } int -lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval) +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) { int error; u_int offset; @@ -130,14 +194,14 @@ lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval) error = 0; } else { offset = x2apic_msr_to_regoff(msr); - error = vlapic_read(vlapic, offset, rval); + error = vlapic_read(vlapic, offset, rval, retu); } return (error); } int -lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) { int error; u_int offset; @@ -150,7 +214,7 @@ lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val) error = 0; } else { offset = x2apic_msr_to_regoff(msr); - error = vlapic_write(vlapic, offset, val); + error = vlapic_write(vlapic, offset, val, retu); } return (error); @@ -174,7 +238,7 @@ lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, return (EINVAL); vlapic = vm_lapic(vm, cpu); - error = vlapic_write(vlapic, off, wval); + error = vlapic_write(vlapic, off, wval, arg); return (error); } @@ -196,6 +260,6 @@ lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, return (EINVAL); vlapic = vm_lapic(vm, cpu); - error = vlapic_read(vlapic, off, rval); + error = vlapic_read(vlapic, off, rval, arg); return (error); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index 1461185..c5c95aa 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -32,16 +32,16 @@ struct vm; boolean_t lapic_msr(u_int num); -int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval); -int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, + bool *retu); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, + bool *retu); int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, void *arg); int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, void *arg); -int lapic_timer_tick(struct vm *vm, int cpu); - /* * Returns a vector between 32 and 255 if an interrupt is pending in the * IRR that can be delivered based on the current state of ISR and TPR. @@ -84,4 +84,12 @@ lapic_intr_edge(struct vm *vm, int cpu, int vector) return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); } +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + #endif diff --git a/sys/amd64/vmm/vmm_msr.c b/sys/amd64/vmm/vmm_msr.c index 4011bb5..03e0071 100644 --- a/sys/amd64/vmm/vmm_msr.c +++ b/sys/amd64/vmm/vmm_msr.c @@ -154,13 +154,13 @@ msr_num_to_idx(u_int num) } int -emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) +emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val, bool *retu) { int idx; uint64_t *guest_msrs; if (lapic_msr(num)) - return (lapic_wrmsr(vm, cpu, num, val)); + return (lapic_wrmsr(vm, cpu, num, val, retu)); idx = msr_num_to_idx(num); if (idx < 0 || invalid_msr(idx)) @@ -181,14 +181,14 @@ emulate_wrmsr(struct vm *vm, int cpu, u_int num, uint64_t val) } int -emulate_rdmsr(struct vm *vm, int cpu, u_int num) +emulate_rdmsr(struct vm *vm, int cpu, u_int num, bool *retu) { int error, idx; uint32_t eax, edx; uint64_t result, *guest_msrs; if (lapic_msr(num)) { - error = lapic_rdmsr(vm, cpu, num, &result); + error = lapic_rdmsr(vm, cpu, num, &result, retu); goto done; } diff --git a/sys/amd64/vmm/vmm_msr.h b/sys/amd64/vmm/vmm_msr.h index 8a1fda3..e070037 100644 --- a/sys/amd64/vmm/vmm_msr.h +++ b/sys/amd64/vmm/vmm_msr.h @@ -33,8 +33,9 @@ struct vm; void vmm_msr_init(void); -int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val); -int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr); +int emulate_wrmsr(struct vm *vm, int vcpu, u_int msr, uint64_t val, + bool *retu); +int emulate_rdmsr(struct vm *vm, int vcpu, u_int msr, bool *retu); void guest_msrs_init(struct vm *vm, int cpu); void guest_msr_valid(int msr); void restore_host_msrs(struct vm *vm, int cpu); |