author | jhb <jhb@FreeBSD.org> | 2014-05-17 19:11:08 +0000
committer | jhb <jhb@FreeBSD.org> | 2014-05-17 19:11:08 +0000
commit | bbf655f9b49cc39db4559ede5c58d302ff8f3de2 (patch)
tree | f6cf26193250fdea84a6946390d9759716c70b5c /sys/amd64
parent | 7e7928763170f8b10771c099cf46224daaf67bca (diff)
download | FreeBSD-src-bbf655f9b49cc39db4559ede5c58d302ff8f3de2.zip FreeBSD-src-bbf655f9b49cc39db4559ede5c58d302ff8f3de2.tar.gz
MFC 259641,259863,259924,259937,259961,259978,260380,260383,260410,260466,
260531,260532,260550,260619,261170,261453,261621,263280,263290,264516:
Add support for local APIC hardware-assist.
- Restructure vlapic access and register handling to support hardware-assist
for the local APIC.
- Use the 'Virtual Interrupt Delivery' and 'Posted Interrupt Processing'
features of Intel VT-x if supported by hardware (a condensed sketch of the
interrupt-posting path follows the diffstat).
- Add an API to rendezvous all active vcpus in a virtual machine and use
it to support level-triggered interrupts with VT-x 'Virtual Interrupt
Delivery' (a usage sketch follows this message).
- Use a cheaper IPI handler than IPI_AST for nested page table shootdowns
and avoid doing unnecessary nested TLB invalidations.
Reviewed by: neel
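
Before the file-level changes, a minimal usage sketch of the new rendezvous API declared in sys/amd64/include/vmm.h by this commit. The callback and wrapper names here are hypothetical illustrations; the constraints (the callback must not sleep, the initiator must hold no locks, and 'vcpuid' is -1 when not initiated from a vcpu context) come from the API comment in the diff below, and the pattern mirrors the real caller added in vioapic.c.

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <machine/vmm.h>

    /* Hypothetical per-vcpu callback; it is not allowed to sleep. */
    static void
    example_rendezvous_func(struct vm *vm, int vcpuid, void *arg)
    {
            /* Per-vcpu work, e.g. refreshing the vlapic TMR as vioapic.c does. */
    }

    /* Rendezvous every active vcpu; 'vcpuid' is the initiator, or -1. */
    static void
    example_rendezvous(struct vm *vm, int vcpuid)
    {
            cpuset_t allvcpus;

            allvcpus = vm_active_cpus(vm);
            vm_smp_rendezvous(vm, vcpuid, allvcpus, example_rendezvous_func,
                NULL);
    }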
Diffstat (limited to 'sys/amd64')
-rw-r--r-- | sys/amd64/amd64/pmap.c | 7
-rw-r--r-- | sys/amd64/include/pmap.h | 7
-rw-r--r-- | sys/amd64/include/vmm.h | 40
-rw-r--r-- | sys/amd64/vmm/amd/amdv.c | 20
-rw-r--r-- | sys/amd64/vmm/intel/ept.c | 5
-rw-r--r-- | sys/amd64/vmm/intel/ept.h | 2
-rw-r--r-- | sys/amd64/vmm/intel/vmcs.c | 44
-rw-r--r-- | sys/amd64/vmm/intel/vmcs.h | 41
-rw-r--r-- | sys/amd64/vmm/intel/vmx.c | 698
-rw-r--r-- | sys/amd64/vmm/intel/vmx.h | 29
-rw-r--r-- | sys/amd64/vmm/intel/vmx_controls.h | 23
-rw-r--r-- | sys/amd64/vmm/intel/vmx_genassym.c | 9
-rw-r--r-- | sys/amd64/vmm/intel/vmx_support.S | 33
-rw-r--r-- | sys/amd64/vmm/io/vioapic.c | 87
-rw-r--r-- | sys/amd64/vmm/io/vlapic.c | 669
-rw-r--r-- | sys/amd64/vmm/io/vlapic.h | 115
-rw-r--r-- | sys/amd64/vmm/io/vlapic_priv.h | 185
-rw-r--r-- | sys/amd64/vmm/vmm.c | 234
-rw-r--r-- | sys/amd64/vmm/vmm_ipi.c | 36
-rw-r--r-- | sys/amd64/vmm/vmm_ipi.h | 8
-rw-r--r-- | sys/amd64/vmm/vmm_lapic.c | 26
-rw-r--r-- | sys/amd64/vmm/vmm_lapic.h | 20
-rw-r--r-- | sys/amd64/vmm/vmm_stat.c | 4
-rw-r--r-- | sys/amd64/vmm/vmm_stat.h | 4
24 files changed, 1753 insertions, 593 deletions
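
One piece worth pulling out of the large vmx.c hunk below: a condensed sketch of how a sender posts a vector to a running vcpu, distilled from vmx_set_intr_ready() and vmx_post_intr() in this diff. The descriptor layout is the one defined in vmx.h; post_interrupt() is a hypothetical name for illustration, not a drop-in copy of the committed code.

    #include <sys/param.h>
    #include <machine/atomic.h>

    /* Posted Interrupt Descriptor (Intel SDM section 29.6); from vmx.h below. */
    struct pir_desc {
            uint64_t        pir[4];         /* one request bit per vector, 0..255 */
            uint64_t        pending;        /* nonzero when any pir[] bit is set */
            uint64_t        unused[3];
    } __aligned(64);

    /*
     * Record 'vector' in the PIR descriptor instead of the virtual APIC page,
     * which cannot be modified while the vcpu is running. Returns 1 if the
     * caller should notify the host cpu running the vcpu (the diff does this
     * with ipi_cpu(hostcpu, pirvec)), or 0 if a notification is already
     * outstanding.
     */
    static int
    post_interrupt(struct pir_desc *pir_desc, int vector)
    {
            uint64_t mask;
            int idx;

            idx = vector / 64;
            mask = 1UL << (vector % 64);
            atomic_set_long(&pir_desc->pir[idx], mask);
            return (atomic_cmpset_long(&pir_desc->pending, 0, 1));
    }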
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 4aa66b5..2b61023 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1304,6 +1304,7 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va) static __inline void pmap_invalidate_ept(pmap_t pmap) { + int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), @@ -1328,11 +1329,9 @@ pmap_invalidate_ept(pmap_t pmap) /* * Force the vcpu to exit and trap back into the hypervisor. - * - * XXX this is not optimal because IPI_AST builds a trapframe - * whereas all we need is an 'eoi' followed by 'iret'. */ - ipi_selected(pmap->pm_active, IPI_AST); + ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; + ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 1b5f6a0..e83e07e 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -312,9 +312,10 @@ struct pmap { }; /* flags */ -#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */ -#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */ -#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ typedef struct pmap *pmap_t; diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index 92b767f..fab7e74 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -47,12 +47,12 @@ struct pmap; enum x2apic_state; -typedef int (*vmm_init_func_t)(void); +typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, - struct pmap *pmap); + struct pmap *pmap, void *rendezvous_cookie); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); @@ -69,6 +69,8 @@ typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ @@ -87,6 +89,8 @@ struct vmm_ops { vmi_set_cap_t vmsetcap; vmi_vmspace_alloc vmspace_alloc; vmi_vmspace_free vmspace_free; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; }; extern struct vmm_ops vmm_ops_intel; @@ -132,6 +136,31 @@ cpuset_t vm_active_cpus(struct vm *vm); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); /* + * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. + * The rendezvous 'func(arg)' is not allowed to do anything that will + * cause the thread to be put to sleep. + * + * If the rendezvous is being initiated from a vcpu context then the + * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. + * + * The caller cannot hold any locks when initiating the rendezvous. + * + * The implementation of this API may cause vcpus other than those specified + * by 'dest' to be stalled. 
The caller should not rely on any vcpus making + * forward progress when the rendezvous is in progress. + */ +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); +void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg); + +static __inline int +vcpu_rendezvous_pending(void *rendezvous_cookie) +{ + + return (*(uintptr_t *)rendezvous_cookie != 0); +} + +/* * Return 1 if device indicated by bus/slot/func is supposed to be a * pci passthrough device. * @@ -158,7 +187,7 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) } void *vcpu_stats(struct vm *vm, int vcpu); -void vcpu_notify_event(struct vm *vm, int vcpuid); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); struct vmspace *vm_get_vmspace(struct vm *vm); int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); @@ -267,6 +296,8 @@ enum vm_exitcode { VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_SPINDOWN_CPU, + VM_EXITCODE_RENDEZVOUS, + VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_MAX }; @@ -323,6 +354,9 @@ struct vm_exit { struct { uint64_t rflags; } hlt; + struct { + int vector; + } ioapic_eoi; } u; }; diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c index 6c87901..00484c7 100644 --- a/sys/amd64/vmm/amd/amdv.c +++ b/sys/amd64/vmm/amd/amdv.c @@ -38,7 +38,7 @@ __FBSDID("$FreeBSD$"); #include "io/iommu.h" static int -amdv_init(void) +amdv_init(int ipinum) { printf("amdv_init: not implemented\n"); @@ -67,7 +67,7 @@ amdv_vminit(struct vm *vm, struct pmap *pmap) } static int -amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap) +amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap, void *cookie) { printf("amdv_vmrun: not implemented\n"); @@ -155,6 +155,20 @@ amdv_vmspace_free(struct vmspace *vmspace) return; } +static struct vlapic * +amdv_vlapic_init(void *arg, int vcpuid) +{ + + panic("amdv_vlapic_init: not implmented"); +} + +static void +amdv_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + panic("amdv_vlapic_cleanup: not implemented"); +} + struct vmm_ops vmm_ops_amd = { amdv_init, amdv_cleanup, @@ -171,6 +185,8 @@ struct vmm_ops vmm_ops_amd = { amdv_setcap, amdv_vmspace_alloc, amdv_vmspace_free, + amdv_vlapic_init, + amdv_vlapic_cleanup, }; static int diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c index 18e90f3..5f6c4d0 100644 --- a/sys/amd64/vmm/intel/ept.c +++ b/sys/amd64/vmm/intel/ept.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include "vmx_cpufunc.h" +#include "vmm_ipi.h" #include "vmx_msr.h" #include "ept.h" @@ -76,7 +77,7 @@ SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, &ept_pmap_flags, 0, NULL); int -ept_init(void) +ept_init(int ipinum) { int use_hw_ad_bits, use_superpages, use_exec_only; uint64_t cap; @@ -98,6 +99,8 @@ ept_init(void) !INVEPT_ALL_TYPES_SUPPORTED(cap)) return (EINVAL); + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; + use_superpages = 1; TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); if (use_superpages && EPT_PDE_SUPERPAGE(cap)) diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h index dfd3a44..1393e46 100644 --- a/sys/amd64/vmm/intel/ept.h +++ b/sys/amd64/vmm/intel/ept.h @@ -31,7 +31,7 @@ struct vmx; -int ept_init(void); +int ept_init(int ipinum); void ept_invalidate_mappings(u_long eptp); struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); void ept_vmspace_free(struct vmspace 
*vmspace); diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c index 980eac1..1ddefe0 100644 --- a/sys/amd64/vmm/intel/vmcs.c +++ b/sys/amd64/vmm/intel/vmcs.c @@ -315,11 +315,7 @@ done: } int -vmcs_set_defaults(struct vmcs *vmcs, - u_long host_rip, u_long host_rsp, uint64_t eptp, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; @@ -335,22 +331,6 @@ vmcs_set_defaults(struct vmcs *vmcs, */ VMPTRLD(vmcs); - /* - * Load the VMX controls - */ - if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) - goto done; - - /* Guest state */ - /* Initialize guest IA32_PAT MSR with the default value */ pat = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | @@ -422,23 +402,7 @@ vmcs_set_defaults(struct vmcs *vmcs, goto done; /* instruction pointer */ - if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) - goto done; - - /* stack pointer */ - if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) - goto done; - - /* eptp */ - if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) - goto done; - - /* vpid */ - if ((error = vmwrite(VMCS_VPID, vpid)) != 0) - goto done; - - /* msr bitmap */ - if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) + if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0) goto done; /* exception bitmap */ @@ -509,7 +473,7 @@ DB_SHOW_COMMAND(vmcs, db_show_vmcs) switch (exit & 0x8000ffff) { case EXIT_REASON_EXCEPTION: case EXIT_REASON_EXT_INTR: - val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO); + val = vmcs_read(VMCS_EXIT_INTR_INFO); db_printf("Interrupt Type: "); switch (val >> 8 & 0x7) { case 0: @@ -531,7 +495,7 @@ DB_SHOW_COMMAND(vmcs, db_show_vmcs) db_printf(" Vector: %lu", val & 0xff); if (val & 0x800) db_printf(" Error Code: %lx", - vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR)); + vmcs_read(VMCS_EXIT_INTR_ERRCODE)); db_printf("\n"); break; case EXIT_REASON_EPT_FAULT: diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h index b1e2883..fa03826 100644 --- a/sys/amd64/vmm/intel/vmcs.h +++ b/sys/amd64/vmm/intel/vmcs.h @@ -46,12 +46,7 @@ struct msr_entry { }; int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); -int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, - uint64_t eptp, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, - uint16_t vpid); +int vmcs_init(struct vmcs *vmcs); int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); int vmcs_getdesc(struct vmcs *vmcs, int ident, @@ -102,6 +97,7 @@ vmcs_write(uint32_t encoding, uint64_t val) /* 16-bit control fields */ #define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 /* 16-bit guest-state fields */ #define VMCS_GUEST_ES_SELECTOR 0x00000800 @@ -112,6 +108,7 @@ vmcs_write(uint32_t encoding, uint64_t val) #define VMCS_GUEST_GS_SELECTOR 0x0000080A #define VMCS_GUEST_LDTR_SELECTOR 0x0000080C #define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define 
VMCS_GUEST_INTR_STATUS 0x00000810 /* 16-bit host-state fields */ #define VMCS_HOST_ES_SELECTOR 0x00000C00 @@ -133,7 +130,13 @@ vmcs_write(uint32_t encoding, uint64_t val) #define VMCS_TSC_OFFSET 0x00002010 #define VMCS_VIRTUAL_APIC 0x00002012 #define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 #define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) /* 64-bit read-only fields */ #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 @@ -177,8 +180,8 @@ vmcs_write(uint32_t encoding, uint64_t val) /* 32-bit read-only data fields */ #define VMCS_INSTRUCTION_ERROR 0x00004400 #define VMCS_EXIT_REASON 0x00004402 -#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404 -#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 #define VMCS_IDT_VECTORING_INFO 0x00004408 #define VMCS_IDT_VECTORING_ERROR 0x0000440A #define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C @@ -315,7 +318,8 @@ vmcs_write(uint32_t encoding, uint64_t val) #define EXIT_REASON_PAUSE 40 #define EXIT_REASON_MCE 41 #define EXIT_REASON_TPR 43 -#define EXIT_REASON_APIC 44 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 #define EXIT_REASON_GDTR_IDTR 46 #define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_FAULT 48 @@ -326,13 +330,15 @@ vmcs_write(uint32_t encoding, uint64_t val) #define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 /* * VMCS interrupt information fields */ -#define VMCS_INTERRUPTION_INFO_VALID (1U << 31) -#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8) -#define VMCS_INTERRUPTION_INFO_NMI (2 << 8) +#define VMCS_INTR_INFO_VALID (1U << 31) +#define VMCS_INTR_INFO_TYPE(info) (((info) >> 8) & 0x7) +#define VMCS_INTR_INFO_HW_INTR (0 << 8) +#define VMCS_INTR_INFO_NMI (2 << 8) /* * VMCS IDT-Vectoring information fields @@ -365,4 +371,15 @@ vmcs_write(uint32_t encoding, uint64_t val) #define EPT_VIOLATION_GLA_VALID (1UL << 7) #define EPT_VIOLATION_XLAT_VALID (1UL << 8) +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + #endif diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index bcaed4e..b79d174 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -45,15 +45,18 @@ __FBSDID("$FreeBSD$"); #include <machine/cpufunc.h> #include <machine/md_var.h> #include <machine/segments.h> +#include <machine/smp.h> #include <machine/specialreg.h> #include <machine/vmparam.h> #include <machine/vmm.h> #include "vmm_host.h" -#include "vmm_lapic.h" +#include "vmm_ipi.h" #include "vmm_msr.h" #include "vmm_ktr.h" #include "vmm_stat.h" +#include "vlapic.h" +#include "vlapic_priv.h" #include "vmx_msr.h" #include "ept.h" @@ -92,6 +95,7 @@ __FBSDID("$FreeBSD$"); #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ + VM_EXIT_ACKNOWLEDGE_INTERRUPT | \ VM_EXIT_SAVE_PAT | \ VM_EXIT_LOAD_PAT) #define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS @@ -112,7 +116,8 @@ __FBSDID("$FreeBSD$"); #define HANDLED 1 #define UNHANDLED 0 -MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static 
MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); @@ -164,12 +169,33 @@ static int cap_pause_exit; static int cap_unrestricted_guest; static int cap_monitor_trap; static int cap_invpcid; - + +static int virtual_interrupt_delivery; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, + &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); + +static int posted_interrupts; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, + &posted_interrupts, 0, "APICv posted interrupt support"); + +static int pirvec; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, + &pirvec, 0, "APICv posted interrupt vector"); + static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); +/* + * Use the last page below 4GB as the APIC access address. This address is + * occupied by the boot firmware so it is guaranteed that it will not conflict + * with a page in system memory. + */ +#define APIC_ACCESS_ADDRESS 0xFFFFF000 + +static void vmx_inject_pir(struct vlapic *vlapic); + #ifdef KTR static const char * exit_reason_to_str(int reason) @@ -259,8 +285,8 @@ exit_reason_to_str(int reason) return "mce"; case EXIT_REASON_TPR: return "tpr"; - case EXIT_REASON_APIC: - return "apic"; + case EXIT_REASON_APIC_ACCESS: + return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: @@ -281,6 +307,8 @@ exit_reason_to_str(int reason) return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; + case EXIT_REASON_APIC_WRITE: + return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); @@ -424,6 +452,9 @@ vmx_disable(void *arg __unused) static int vmx_cleanup(void) { + + if (pirvec != 0) + vmm_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); @@ -457,11 +488,11 @@ vmx_restore(void) } static int -vmx_init(void) +vmx_init(int ipinum) { - int error; + int error, use_tpr_shadow; uint64_t fixed0, fixed1, feature_control; - uint32_t tmp; + uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { @@ -595,9 +626,58 @@ vmx_init(void) MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); + /* + * Check support for virtual interrupt delivery. + */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp) == 0); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && use_tpr_shadow) { + virtual_interrupt_delivery = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. 
+ */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { + pirvec = vmm_ipi_alloc(); + if (pirvec == 0) { + if (bootverbose) { + printf("vmx_init: unable to allocate " + "posted interrupt vector\n"); + } + } else { + posted_interrupts = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); + } + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ - error = ept_init(); + error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); @@ -638,6 +718,31 @@ vmx_init(void) return (0); } +static void +vmx_trigger_hostintr(int vector) +{ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +} + static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { @@ -676,6 +781,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) uint16_t vpid[VM_MAXCPU]; int i, error, guest_msr_count; struct vmx *vmx; + struct vmcs *vmcs; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { @@ -740,27 +846,52 @@ vmx_vminit(struct vm *vm, pmap_t pmap) vpid_alloc(vpid, VM_MAXCPU); + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + for (i = 0; i < VM_MAXCPU; i++) { - vmx->vmcs[i].identifier = vmx_revision(); - error = vmclear(&vmx->vmcs[i]); + vmcs = &vmx->vmcs[i]; + vmcs->identifier = vmx_revision(); + error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } - error = vmcs_set_defaults(&vmx->vmcs[i], - (u_long)vmx_exit_guest, - (u_long)&vmx->ctx[i], - vmx->eptp, - pinbased_ctls, - procbased_ctls, - procbased_ctls2, - exit_ctls, entry_ctls, - vtophys(vmx->msr_bitmap), - vpid[i]); + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); - if (error != 0) - panic("vmx_vminit: vmcs_set_defaults error %d", error); + VMPTRLD(vmcs); + error = 0; + error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); + error += vmwrite(VMCS_VPID, vpid[i]); + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); + error += vmwrite(VMCS_VIRTUAL_APIC, + 
vtophys(&vmx->apic_page[i])); + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += vmwrite(VMCS_PIR_VECTOR, pirvec); + error += vmwrite(VMCS_PIR_DESC, + vtophys(&vmx->pir_desc[i])); + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; @@ -771,9 +902,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap) msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); - error = vmcs_set_msr_save(&vmx->vmcs[i], - vtophys(vmx->guest_msrs[i]), - guest_msr_count); + error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), + guest_msr_count); if (error != 0) panic("vmcs_set_msr_save error %d", error); @@ -783,16 +913,15 @@ vmx_vminit(struct vm *vm, pmap_t pmap) * CR0 - 0x60000010 * CR4 - 0 */ - error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010); + error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); - error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0); + error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; - vmx->ctx[i].eptp = vmx->eptp; } return (vmx); @@ -840,20 +969,20 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) #endif } +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); + static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { - int lastcpu; struct vmxstate *vmxstate; - struct invvpid_desc invvpid_desc = { 0 }; + struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; - lastcpu = vmxstate->lastcpu; - vmxstate->lastcpu = curcpu; - - if (lastcpu == curcpu) + if (vmxstate->lastcpu == curcpu) return; + vmxstate->lastcpu = curcpu; + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); @@ -876,8 +1005,20 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) * for "all" EP4TAs. */ if (vmxstate->vpid != 0) { - invvpid_desc.vpid = vmxstate->vpid; - invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; + invvpid_desc.vpid = vmxstate->vpid; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); + } } } @@ -935,7 +1076,7 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. */ - info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; + info = VMCS_INTR_INFO_NMI | VMCS_INTR_INFO_VALID; info |= IDT_NMI; vmcs_write(VMCS_ENTRY_INTR_INFO, info); @@ -957,7 +1098,7 @@ nmiblocked: } static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu) +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) { int vector; uint64_t info, rflags, interruptibility; @@ -973,7 +1114,7 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu) * because of a pending AST. 
*/ info = vmcs_read(VMCS_ENTRY_INTR_INFO); - if (info & VMCS_INTERRUPTION_INFO_VALID) + if (info & VMCS_INTR_INFO_VALID) return; /* @@ -982,9 +1123,13 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu) if (vmx_inject_nmi(vmx, vcpu)) return; + if (virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + return; + } + /* Ask the local apic for a vector to inject */ - vector = lapic_pending_intr(vmx->vm, vcpu); - if (vector < 0) + if (!vlapic_pending_intr(vlapic, &vector)) return; if (vector < 32 || vector > 255) @@ -1000,12 +1145,12 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu) goto cantinject; /* Inject the interrupt */ - info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; + info = VMCS_INTR_INFO_HW_INTR | VMCS_INTR_INFO_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); /* Update the Local APIC ISR */ - lapic_intr_accepted(vmx->vm, vcpu, vector); + vlapic_intr_accepted(vlapic, vector); VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); @@ -1175,11 +1320,141 @@ ept_emulation_fault(uint64_t ept_qual) } static int +vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual) +{ + int error, handled, offset; + bool retu; + + if (!virtual_interrupt_delivery) + return (UNHANDLED); + + handled = 1; + offset = APIC_WRITE_OFFSET(qual); + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = 0; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = 0; + break; + } + return (handled); +} + +static bool +apic_access_fault(uint64_t gpa) +{ + + if (virtual_interrupt_delivery && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!virtual_interrupt_delivery) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... 
APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset; + vmexit->u.inst_emul.gla = VIE_INVALID_GLA; + vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. + */ + return (UNHANDLED); +} + +static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, handled; struct vmxctx *vmxctx; - uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason; + struct vlapic *vlapic; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason; uint64_t qual, gpa; bool retu; @@ -1203,7 +1478,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) switch (reason) { case EXIT_REASON_EPT_FAULT: case EXIT_REASON_EPT_MISCONFIG: - case EXIT_REASON_APIC: + case EXIT_REASON_APIC_ACCESS: case EXIT_REASON_TASK_SWITCH: case EXIT_REASON_EXCEPTION: idtvec_info = vmcs_idt_vectoring_info(); @@ -1290,6 +1565,11 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_INFO_VALID) != 0 && + VMCS_INTR_INFO_TYPE(intr_info) == 0, + ("VM exit interruption info invalid: %#x", intr_info)); + vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. We want to treat this as an 'handled' @@ -1318,24 +1598,42 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EPT_FAULT: - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1); /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); - if (vm_mem_allocated(vmx->vm, gpa)) { + if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); } else if (ept_emulation_fault(qual)) { vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = vmcs_gla(); vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); } break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. 
+ */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + handled = vmx_handle_apic_write(vlapic, qual); + break; default: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; @@ -1387,6 +1685,18 @@ vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) } static __inline int +vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + + vmexit->rip = vmcs_guest_rip(); + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1); + + return (UNHANDLED); +} + +static __inline int vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { @@ -1415,26 +1725,29 @@ vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) } static int -vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap) +vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap, + void *rendezvous_cookie) { int rc, handled, launched; struct vmx *vmx; + struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; + struct vlapic *vlapic; uint64_t rip; uint32_t exit_reason; vmx = arg; + vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; - vmexit = vm_exitinfo(vmx->vm, vcpu); + vlapic = vm_lapic(vm, vcpu); + vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); - KASSERT(vmxctx->eptp == vmx->eptp, - ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp)); VMPTRLD(vmcs); @@ -1444,12 +1757,12 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap) * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context - * of a single process we could do this once in vmcs_set_defaults(). + * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, startrip); - vmx_set_pcpu_defaults(vmx, vcpu); + vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { /* * Interrupts are disabled from this point on until the @@ -1476,9 +1789,15 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap) break; } - vmx_inject_interrupts(vmx, vcpu); + if (vcpu_rendezvous_pending(rendezvous_cookie)) { + enable_intr(); + handled = vmx_exit_rendezvous(vmx, vcpu, vmexit); + break; + } + + vmx_inject_interrupts(vmx, vcpu, vlapic); vmx_run_trace(vmx, vcpu); - rc = vmx_enter_guest(vmxctx, launched); + rc = vmx_enter_guest(vmxctx, vmx, launched); enable_intr(); @@ -1509,9 +1828,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap) } if (!handled) - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); + vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR1(vmx->vm, vcpu, "returning from vmx_run: exitcode %d", + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); @@ -1524,6 +1843,9 @@ vmx_vmcleanup(void *arg) int i, error; struct vmx *vmx = arg; + if (virtual_interrupt_delivery) + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + for (i = 0; i < VM_MAXCPU; i++) vpid_free(vmx->state[i].vpid); @@ -1731,11 +2053,11 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, if (error) return (error); - if (info & VMCS_INTERRUPTION_INFO_VALID) + if (info & VMCS_INTR_INFO_VALID) return (EAGAIN); info = vector | (type_map[type] << 8) | (code_valid ? 
1 << 11 : 0); - info |= VMCS_INTERRUPTION_INFO_VALID; + info |= VMCS_INTR_INFO_VALID; error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); if (error != 0) return (error); @@ -1887,6 +2209,258 @@ vmx_setcap(void *arg, int vcpu, int type, int val) return (retval); } +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx *vmx; +}; + +#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ +do { \ + VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ + level ? "level" : "edge", vector); \ + VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ + VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ + VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ + VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ + VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ +} while (0) + +/* + * vlapic->ops handlers that utilize the APICv hardware assist described in + * Chapter 29 of the Intel SDM. + */ +static int +vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint64_t mask; + int idx, notify; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + /* + * Keep track of interrupt requests in the PIR descriptor. This is + * because the virtual APIC page pointed to by the VMCS cannot be + * modified if the vcpu is running. + */ + idx = vector / 64; + mask = 1UL << (vector % 64); + atomic_set_long(&pir_desc->pir[idx], mask); + notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, + level, "vmx_set_intr_ready"); + return (notify); +} + +static int +vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t pending, pirval; + uint32_t ppr, vpr; + int i; + + /* + * This function is only expected to be called from the 'HLT' exit + * handler which does not care about the vector that is pending. + */ + KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + pending = atomic_load_acq_long(&pir_desc->pending); + if (!pending) + return (0); /* common case */ + + /* + * If there is an interrupt pending then it will be recognized only + * if its priority is greater than the processor priority. + * + * Special case: if the processor priority is zero then any pending + * interrupt will be recognized. 
+ */ + lapic = vlapic->apic_page; + ppr = lapic->ppr & 0xf0; + if (ppr == 0) + return (1); + + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", + lapic->ppr); + + for (i = 3; i >= 0; i--) { + pirval = pir_desc->pir[i]; + if (pirval != 0) { + vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; + return (vpr > ppr); + } + } + return (0); +} + +static void +vmx_intr_accepted(struct vlapic *vlapic, int vector) +{ + + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmx *vmx; + struct vmcs *vmcs; + uint64_t mask, val; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), + ("vmx_set_tmr: vcpu cannot be running")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vmx = vlapic_vtx->vmx; + vmcs = &vmx->vmcs[vlapic->vcpuid]; + mask = 1UL << (vector % 64); + + VMPTRLD(vmcs); + val = vmcs_read(VMCS_EOI_EXIT(vector)); + if (level) + val |= mask; + else + val &= ~mask; + vmcs_write(VMCS_EOI_EXIT(vector), val); + VMCLEAR(vmcs); +} + +static void +vmx_post_intr(struct vlapic *vlapic, int hostcpu) +{ + + ipi_cpu(hostcpu, pirvec); +} + +/* + * Transfer the pending interrupts in the PIR descriptor to the IRR + * in the virtual APIC page. + */ +static void +vmx_inject_pir(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t val, pirval; + int rvi, pirbase; + uint16_t intr_status_old, intr_status_new; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "no posted interrupt pending"); + return; + } + + pirval = 0; + lapic = vlapic->apic_page; + + val = atomic_readandclear_long(&pir_desc->pir[0]); + if (val != 0) { + lapic->irr0 |= val; + lapic->irr1 |= val >> 32; + pirbase = 0; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[1]); + if (val != 0) { + lapic->irr2 |= val; + lapic->irr3 |= val >> 32; + pirbase = 64; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[2]); + if (val != 0) { + lapic->irr4 |= val; + lapic->irr5 |= val >> 32; + pirbase = 128; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[3]); + if (val != 0) { + lapic->irr6 |= val; + lapic->irr7 |= val >> 32; + pirbase = 192; + pirval = val; + } + VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); + + /* + * Update RVI so the processor can evaluate pending virtual + * interrupts on VM-entry. 
+ */ + if (pirval != 0) { + rvi = pirbase + flsl(pirval) - 1; + intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + intr_status_new = (intr_status_old & 0xFF00) | rvi; + if (intr_status_new > intr_status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "guest_intr_status changed from 0x%04x to 0x%04x", + intr_status_old, intr_status_new); + } + } +} + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; + vlapic_vtx->vmx = vmx; + + if (virtual_interrupt_delivery) { + vlapic->ops.set_intr_ready = vmx_set_intr_ready; + vlapic->ops.pending_intr = vmx_pending_intr; + vlapic->ops.intr_accepted = vmx_intr_accepted; + vlapic->ops.set_tmr = vmx_set_tmr; + } + + if (posted_interrupts) + vlapic->ops.post_intr = vmx_post_intr; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, @@ -1903,4 +2477,6 @@ struct vmm_ops vmm_ops_intel = { vmx_setcap, ept_vmspace_alloc, ept_vmspace_free, + vmx_vlapic_init, + vmx_vlapic_cleanup, }; diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h index 67ef631..80bfd72 100644 --- a/sys/amd64/vmm/intel/vmx.h +++ b/sys/amd64/vmm/intel/vmx.h @@ -64,16 +64,13 @@ struct vmxctx { /* * XXX todo debug registers and fpu state */ - - int inst_fail_status; - long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ + int inst_fail_status; /* - * The 'eptp' and the 'pmap' do not change during the lifetime of - * the VM so it is safe to keep a copy in each vcpu's vmxctx. + * The pmap needs to be deactivated in vmx_exit_guest() + * so keep a copy of the 'pmap' in each vmxctx. 
*/ - vm_paddr_t eptp; struct pmap *pmap; }; @@ -88,27 +85,45 @@ struct vmxstate { uint16_t vpid; }; +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + /* virtual machine softc */ struct vmx { struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ char msr_bitmap[PAGE_SIZE]; + struct pir_desc pir_desc[VM_MAXCPU]; struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; struct vmxctx ctx[VM_MAXCPU]; struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; uint64_t eptp; struct vm *vm; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ }; CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); #define VMX_GUEST_VMEXIT 0 #define VMX_VMRESUME_ERROR 1 #define VMX_VMLAUNCH_ERROR 2 #define VMX_INVEPT_ERROR 3 -int vmx_enter_guest(struct vmxctx *ctx, int launched); +int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); void vmx_exit_guest(void); +void vmx_call_isr(uintptr_t entry); u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h index 3cd2eff..2b117ae 100644 --- a/sys/amd64/vmm/intel/vmx_controls.h +++ b/sys/amd64/vmm/intel/vmx_controls.h @@ -34,6 +34,7 @@ #define PINBASED_NMI_EXITING (1 << 3) #define PINBASED_VIRTUAL_NMI (1 << 5) #define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) /* Primary Processor-Based VM-Execution Controls */ #define PROCBASED_INT_WINDOW_EXITING (1 << 2) @@ -59,16 +60,18 @@ #define PROCBASED_SECONDARY_CONTROLS (1U << 31) /* Secondary Processor-Based VM-Execution Controls */ -#define PROCBASED2_VIRTUALIZE_APIC (1 << 0) -#define PROCBASED2_ENABLE_EPT (1 << 1) -#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) -#define PROCBASED2_ENABLE_RDTSCP (1 << 3) -#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4) -#define PROCBASED2_ENABLE_VPID (1 << 5) -#define PROCBASED2_WBINVD_EXITING (1 << 6) -#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) -#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) -#define PROCBASED2_ENABLE_INVPCID (1 << 12) +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define PROCBASED2_ENABLE_INVPCID (1 << 12) /* VM Exit Controls */ #define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c index bf463dc..5c91fec 100644 --- a/sys/amd64/vmm/intel/vmx_genassym.c +++ b/sys/amd64/vmm/intel/vmx_genassym.c @@ -68,10 +68,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip)); 
ASSYM(VMXCTX_INST_FAIL_STATUS, offsetof(struct vmxctx, inst_fail_status)); -ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen)); - ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap)); -ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp)); + +ASSYM(VMX_EPTGEN, offsetof(struct vmx, eptgen)); +ASSYM(VMX_EPTP, offsetof(struct vmx, eptp)); ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); @@ -84,3 +84,6 @@ ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen)); + +ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL)); +ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL)); diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S index d616984..9e8cf2d 100644 --- a/sys/amd64/vmm/intel/vmx_support.S +++ b/sys/amd64/vmm/intel/vmx_support.S @@ -97,7 +97,8 @@ /* * vmx_enter_guest(struct vmxctx *vmxctx, int launched) * %rdi: pointer to the 'vmxctx' - * %esi: launch state of the VMCS + * %rsi: pointer to the 'vmx' + * %edx: launch state of the VMCS * Interrupts must be disabled on entry. */ ENTRY(vmx_enter_guest) @@ -114,19 +115,19 @@ ENTRY(vmx_enter_guest) LK btsl %eax, PM_ACTIVE(%r11) /* - * If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' * then we must invalidate all mappings associated with this EPTP. */ movq PM_EPTGEN(%r11), %r10 - cmpq %r10, VMXCTX_EPTGEN(%rdi, %rax, 8) + cmpq %r10, VMX_EPTGEN(%rsi, %rax, 8) je guest_restore - /* Refresh 'vmxctx->eptgen[curcpu]' */ - movq %r10, VMXCTX_EPTGEN(%rdi, %rax, 8) + /* Refresh 'vmx->eptgen[curcpu]' */ + movq %r10, VMX_EPTGEN(%rsi, %rax, 8) /* Setup the invept descriptor on the host stack */ mov %rsp, %r11 - movq VMXCTX_EPTP(%rdi), %rax + movq VMX_EPTP(%rsi), %rax movq %rax, -16(%r11) movq $0x0, -8(%r11) mov $0x1, %eax /* Single context invalidate */ @@ -134,7 +135,7 @@ ENTRY(vmx_enter_guest) jbe invept_error /* Check invept instruction error */ guest_restore: - cmpl $0, %esi + cmpl $0, %edx je do_launch VMX_GUEST_RESTORE @@ -234,3 +235,21 @@ ENTRY(vmx_exit_guest) movl $VMX_GUEST_VMEXIT, %eax ret END(vmx_exit_guest) + +/* + * %rdi = interrupt handler entry point + * + * Calling sequence described in the "Instruction Set Reference" for the "INT" + * instruction in Intel SDM, Vol 2. + */ +ENTRY(vmx_call_isr) + mov %rsp, %r11 /* save %rsp */ + and $~0xf, %rsp /* align on 16-byte boundary */ + pushq $KERNEL_SS /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KERNEL_CS /* %cs */ + cli /* disable interrupts */ + callq *%rdi /* push %rip and call isr */ + ret +END(vmx_call_isr) diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c index 151065a..703e479 100644 --- a/sys/amd64/vmm/io/vioapic.c +++ b/sys/amd64/vmm/io/vioapic.c @@ -222,8 +222,52 @@ vioapic_pulse_irq(struct vm *vm, int irq) return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } +/* + * Reset the vlapic's trigger-mode register to reflect the ioapic pin + * configuration. + */ +static void +vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +{ + struct vioapic *vioapic; + struct vlapic *vlapic; + uint32_t low, high, dest; + int delmode, pin, vector; + bool level, phys; + + vlapic = vm_lapic(vm, vcpuid); + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + /* + * Reset all vectors to be edge-triggered. 
+ */ + vlapic_reset_tmr(vlapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + level = low & IOART_TRGRLVL ? true : false; + if (!level) + continue; + + /* + * For a level-triggered 'pin' let the vlapic figure out if + * an assertion on this 'pin' would result in an interrupt + * being delivered to it. If yes, then it will modify the + * TMR bit associated with this vector to level-triggered. + */ + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + } + VIOAPIC_UNLOCK(vioapic); +} + static uint32_t -vioapic_read(struct vioapic *vioapic, uint32_t addr) +vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) { int regnum, pin, rshift; @@ -258,10 +302,12 @@ vioapic_read(struct vioapic *vioapic, uint32_t addr) } static void -vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data) +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) { uint64_t data64, mask64; + uint64_t last, changed; int regnum, pin, lshift; + cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { @@ -285,6 +331,8 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data) else lshift = 0; + last = vioapic->rtbl[pin].reg; + data64 = (uint64_t)data << lshift; mask64 = (uint64_t)0xffffffff << lshift; vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; @@ -294,6 +342,22 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data) pin, vioapic->rtbl[pin].reg); /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then rendezvous all the vcpus + * to update their vlapic trigger-mode registers. 
+ */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + VIOAPIC_UNLOCK(vioapic); + allvcpus = vm_active_cpus(vioapic->vm); + vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, + vioapic_update_tmr, NULL); + VIOAPIC_LOCK(vioapic); + } + + /* * Generate an interrupt if the following conditions are met: * - pin is not masked * - previous interrupt has been EOIed @@ -310,8 +374,8 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data) } static int -vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa, uint64_t *data, - int size, bool doread) +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) { uint64_t offset; @@ -334,10 +398,13 @@ vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa, uint64_t *data, else vioapic->ioregsel = *data; } else { - if (doread) - *data = vioapic_read(vioapic, vioapic->ioregsel); - else - vioapic_write(vioapic, vioapic->ioregsel, *data); + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + *data); + } } VIOAPIC_UNLOCK(vioapic); @@ -352,7 +419,7 @@ vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, struct vioapic *vioapic; vioapic = vm_ioapic(vm); - error = vioapic_mmio_rw(vioapic, gpa, rval, size, true); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); return (error); } @@ -364,7 +431,7 @@ vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, struct vioapic *vioapic; vioapic = vm_ioapic(vm); - error = vioapic_mmio_rw(vioapic, gpa, &wval, size, false); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); return (error); } diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 695040d..2395247 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -37,108 +37,34 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/smp.h> -#include <machine/clock.h> #include <x86/specialreg.h> #include <x86/apicreg.h> +#include <machine/clock.h> +#include <machine/smp.h> + #include <machine/vmm.h> -#include "vmm_stat.h" +#include "vmm_ipi.h" #include "vmm_lapic.h" #include "vmm_ktr.h" +#include "vmm_stat.h" + #include "vlapic.h" +#include "vlapic_priv.h" #include "vioapic.h" -#define VLAPIC_CTR0(vlapic, format) \ - VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) - -#define VLAPIC_CTR1(vlapic, format, p1) \ - VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) - -#define VLAPIC_CTR2(vlapic, format, p1, p2) \ - VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) - -#define VLAPIC_CTR_IRR(vlapic, msg) \ -do { \ - uint32_t *irrptr = &(vlapic)->apic.irr0; \ - irrptr[0] = irrptr[0]; /* silence compiler */ \ - VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ - VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ -} while (0) - -#define VLAPIC_CTR_ISR(vlapic, msg) \ -do { \ - uint32_t *isrptr = &(vlapic)->apic.isr0; \ - isrptr[0] = isrptr[0]; /* silence compiler */ \ - 
VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ - VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ -} while (0) - -static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); - #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (16) -#define VLAPIC_MAXLVT_ENTRIES (APIC_LVT_CMCI) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) -enum boot_state { - BS_INIT, - BS_SIPI, - BS_RUNNING -}; - -struct vlapic { - struct vm *vm; - int vcpuid; - - struct LAPIC apic; - - uint32_t esr_pending; - int esr_firing; - - struct callout callout; /* vlapic timer */ - struct bintime timer_fire_bt; /* callout expiry time */ - struct bintime timer_freq_bt; /* timer frequency */ - struct bintime timer_period_bt; /* timer period */ - struct mtx timer_mtx; - - /* - * The 'isrvec_stk' is a stack of vectors injected by the local apic. - * A vector is popped from the stack when the processor does an EOI. - * The vector on the top of the stack is used to compute the - * Processor Priority in conjunction with the TPR. - */ - uint8_t isrvec_stk[ISRVEC_STK_SIZE]; - int isrvec_stk_top; - - uint64_t msr_apicbase; - enum boot_state boot_state; -}; - /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the - * vlapic_callout_handler() and vcpu accesses to the following registers: - * - initial count register aka icr_timer - * - current count register aka ccr_timer - * - divide config register aka dcr_timer + * vlapic_callout_handler() and vcpu accesses to: + * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register - * - * Note that the vlapic_callout_handler() does not write to any of these - * registers so they can be safely read from the vcpu context without locking. 
*/ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) @@ -156,72 +82,71 @@ vlapic_get_id(struct vlapic *vlapic) return (vlapic->vcpuid << 24); } -static __inline uint32_t -vlapic_get_ldr(struct vlapic *vlapic) +static uint32_t +x2apic_ldr(struct vlapic *vlapic) { - struct LAPIC *lapic; int apicid; uint32_t ldr; - lapic = &vlapic->apic; - if (x2apic(vlapic)) { - apicid = vlapic_get_id(vlapic); - ldr = 1 << (apicid & 0xf); - ldr |= (apicid & 0xffff0) << 12; - return (ldr); - } else - return (lapic->ldr); + apicid = vlapic_get_id(vlapic); + ldr = 1 << (apicid & 0xf); + ldr |= (apicid & 0xffff0) << 12; + return (ldr); } -static __inline uint32_t -vlapic_get_dfr(struct vlapic *vlapic) +void +vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; - lapic = &vlapic->apic; - if (x2apic(vlapic)) - return (0); - else - return (lapic->dfr); -} - -static void -vlapic_set_dfr(struct vlapic *vlapic, uint32_t data) -{ - uint32_t dfr; - struct LAPIC *lapic; - + lapic = vlapic->apic_page; if (x2apic(vlapic)) { - VM_CTR1(vlapic->vm, "write to DFR in x2apic mode: %#x", data); + VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", + lapic->dfr); + lapic->dfr = 0; return; } - lapic = &vlapic->apic; - dfr = (lapic->dfr & APIC_DFR_RESERVED) | (data & APIC_DFR_MODEL_MASK); - if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) + lapic->dfr &= APIC_DFR_MODEL_MASK; + lapic->dfr |= APIC_DFR_RESERVED; + + if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); - else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) + else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else - VLAPIC_CTR1(vlapic, "vlapic DFR in Unknown Model %#x", dfr); - - lapic->dfr = dfr; + VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } -static void -vlapic_set_ldr(struct vlapic *vlapic, uint32_t data) +void +vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; + lapic = vlapic->apic_page; + /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { - VLAPIC_CTR1(vlapic, "write to LDR in x2apic mode: %#x", data); - return; + VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", + lapic->ldr); + lapic->ldr = x2apic_ldr(vlapic); + } else { + lapic->ldr &= ~APIC_LDR_RESERVED; + VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } +} - lapic = &vlapic->apic; - lapic->ldr = data & ~APIC_LDR_RESERVED; - VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); +void +vlapic_id_write_handler(struct vlapic *vlapic) +{ + struct LAPIC *lapic; + + /* + * We don't allow the ID register to be modified so reset it back to + * its default value. 
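
(A quick worked example of the x2apic_ldr() computation above; the function below is an illustrative standalone sketch, not part of this commit, and the apicid value is hypothetical.)

#include <stdint.h>

static uint32_t
x2apic_ldr_sketch(int apicid)
{
	uint32_t ldr;

	ldr = 1 << (apicid & 0xf);		/* apicid 0x25: 1 << 5 = 0x00020 */
	ldr |= (apicid & 0xffff0) << 12;	/* 0x20 << 12 = 0x20000 */
	return (ldr);				/* apicid 0x25 -> ldr 0x20020 */
}
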
+ */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); } static int @@ -249,16 +174,6 @@ vlapic_timer_divisor(uint32_t dcr) } } -static void -vlapic_mask_lvts(uint32_t *lvts, int num_lvt) -{ - int i; - for (i = 0; i < num_lvt; i++) { - *lvts |= APIC_LVT_M; - lvts += 4; - } -} - #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) @@ -277,7 +192,7 @@ vlapic_get_ccr(struct vlapic *vlapic) uint32_t ccr; ccr = 0; - lapic = &vlapic->apic; + lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { @@ -301,18 +216,18 @@ vlapic_get_ccr(struct vlapic *vlapic) return (ccr); } -static void -vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr) +void +vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; - lapic = &vlapic->apic; + lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); - lapic->dcr_timer = dcr; - divisor = vlapic_timer_divisor(dcr); - VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, divisor); + divisor = vlapic_timer_divisor(lapic->dcr_timer); + VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", + lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. @@ -327,57 +242,42 @@ vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr) VLAPIC_TIMER_UNLOCK(vlapic); } -static void -vlapic_update_errors(struct vlapic *vlapic) -{ - struct LAPIC *lapic = &vlapic->apic; - lapic->esr = vlapic->esr_pending; - vlapic->esr_pending = 0; -} - -static void -vlapic_reset(struct vlapic *vlapic) +void +vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; - lapic = &vlapic->apic; - bzero(lapic, sizeof(struct LAPIC)); - - lapic->version = VLAPIC_VERSION; - lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT); - lapic->dfr = 0xffffffff; - lapic->svr = APIC_SVR_VECTOR; - vlapic_mask_lvts(&lapic->lvt_timer, 6); - vlapic_mask_lvts(&lapic->lvt_cmci, 1); - vlapic_set_dcr(vlapic, 0); - - if (vlapic->vcpuid == 0) - vlapic->boot_state = BS_RUNNING; /* BSP */ - else - vlapic->boot_state = BS_INIT; /* AP */ + lapic = vlapic->apic_page; + lapic->esr = vlapic->esr_pending; + vlapic->esr_pending = 0; } -void +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { - struct LAPIC *lapic = &vlapic->apic; - uint32_t *irrptr, *tmrptr, mask; - int idx; + struct LAPIC *lapic; + uint32_t *irrptr, *tmrptr, mask; + int idx; - if (vector < 0 || vector >= 256) - panic("vlapic_set_intr_ready: invalid vector %d\n", vector); + KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); + lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); - return; + return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); - return; + VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", + vector); + return (1); } - + + if (vlapic->ops.set_intr_ready) + return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + idx = (vector / 32) * 4; mask = 1 << (vector % 32); @@ -385,23 +285,22 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) atomic_set_int(&irrptr[idx], mask); /* - * Upon acceptance of an interrupt into the IRR the corresponding - * TMR bit is cleared for edge-triggered interrupts and set for - * level-triggered interrupts. + * Verify that the trigger-mode of the interrupt matches with + * the vlapic TMR registers. 
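
(An aside on the indexing scheme used by vlapic_set_intr_ready() and the other IRR/TMR walkers in this file: the 256 vectors are spread across eight 32-bit registers spaced 16 bytes apart, hence the stride of 4 uint32_t slots. The helper below is an illustrative sketch, not code from the commit.)

#include <stdint.h>

static void
vec_to_reg_sketch(int vector, int *idx, uint32_t *mask)
{
	*idx = (vector / 32) * 4;	/* vector 33 -> irr1/tmr1, idx 4 */
	*mask = 1u << (vector % 32);	/* vector 33 -> bit 1, mask 0x2 */
}
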
*/ tmrptr = &lapic->tmr0; - if (level) - atomic_set_int(&tmrptr[idx], mask); - else - atomic_clear_int(&tmrptr[idx], mask); + KASSERT((tmrptr[idx] & mask) == (level ? mask : 0), + ("vlapic TMR[%d] is 0x%08x but interrupt is %s-triggered", + idx / 4, tmrptr[idx], level ? "level" : "edge")); VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); + return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { @@ -415,24 +314,65 @@ vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) } } +static __inline int +lvt_off_to_idx(uint32_t offset) +{ + int index; + + switch (offset) { + case APIC_OFFSET_CMCI_LVT: + index = APIC_LVT_CMCI; + break; + case APIC_OFFSET_TIMER_LVT: + index = APIC_LVT_TIMER; + break; + case APIC_OFFSET_THERM_LVT: + index = APIC_LVT_THERMAL; + break; + case APIC_OFFSET_PERF_LVT: + index = APIC_LVT_PMC; + break; + case APIC_OFFSET_LINT0_LVT: + index = APIC_LVT_LINT0; + break; + case APIC_OFFSET_LINT1_LVT: + index = APIC_LVT_LINT1; + break; + case APIC_OFFSET_ERROR_LVT: + index = APIC_LVT_ERROR; + break; + default: + index = -1; + break; + } + KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " + "invalid lvt index %d for offset %#x", index, offset)); + + return (index); +} + static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { + int idx; + uint32_t val; - return (*vlapic_get_lvtptr(vlapic, offset)); + idx = lvt_off_to_idx(offset); + val = atomic_load_acq_32(&vlapic->lvt_last[idx]); + return (val); } -static void -vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val) +void +vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { - uint32_t *lvtptr, mask; + uint32_t *lvtptr, mask, val; struct LAPIC *lapic; + int idx; - lapic = &vlapic->apic; + lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); - - if (offset == APIC_OFFSET_TIMER_LVT) - VLAPIC_TIMER_LOCK(vlapic); + val = *lvtptr; + idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; @@ -451,10 +391,36 @@ vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val) mask |= APIC_LVT_DM; break; } - *lvtptr = val & mask; + val &= mask; + *lvtptr = val; + atomic_store_rel_32(&vlapic->lvt_last[idx], val); +} + +static void +vlapic_mask_lvts(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + lapic->lvt_cmci |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); + + lapic->lvt_timer |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); - if (offset == APIC_OFFSET_TIMER_LVT) - VLAPIC_TIMER_UNLOCK(vlapic); + lapic->lvt_thermal |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); + + lapic->lvt_pcint |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); + + lapic->lvt_lint0 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); + + lapic->lvt_lint1 |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); + + lapic->lvt_error |= APIC_LVT_M; + vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int @@ -474,8 +440,8 @@ vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); return (0); } - vlapic_set_intr_ready(vlapic, vec, false); - vcpu_notify_event(vlapic->vm, vlapic->vcpuid); + if (vlapic_set_intr_ready(vlapic, vec, false)) + vcpu_notify_event(vlapic->vm, 
vlapic->vcpuid, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); @@ -494,7 +460,7 @@ dump_isrvec_stk(struct vlapic *vlapic) int i; uint32_t *isrptr; - isrptr = &vlapic->apic.isr0; + isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); @@ -519,7 +485,7 @@ vlapic_update_ppr(struct vlapic *vlapic) * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; - tpr = vlapic->apic.tpr; + tpr = vlapic->apic_page->tpr; #if 1 { @@ -548,7 +514,7 @@ vlapic_update_ppr(struct vlapic *vlapic) * corresponding entry on the isrvec stack. */ i = 1; - isrptr = &vlapic->apic.isr0; + isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { @@ -568,14 +534,14 @@ vlapic_update_ppr(struct vlapic *vlapic) else ppr = isrvec & 0xf0; - vlapic->apic.ppr = ppr; + vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } static void vlapic_process_eoi(struct vlapic *vlapic) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; @@ -675,7 +641,7 @@ vlapic_fire_cmci(struct vlapic *vlapic) } } -static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_ENTRIES, +static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int @@ -735,8 +701,6 @@ vlapic_callout_handler(void *arg) callout_deactivate(&vlapic->callout); - KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled")); - vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { @@ -781,16 +745,17 @@ done: VLAPIC_TIMER_UNLOCK(vlapic); } -static void -vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer) +void +vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; + uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); - lapic = &vlapic->apic; - lapic->icr_timer = icr_timer; + lapic = vlapic->apic_page; + icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); @@ -872,8 +837,8 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, CPU_CLR(vcpuid, &amask); vlapic = vm_lapic(vm, vcpuid); - dfr = vlapic_get_dfr(vlapic); - ldr = vlapic_get_ldr(vlapic); + dfr = vlapic->apic_page->dfr; + ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { @@ -912,16 +877,22 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); -static int -lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu) +int +vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask; + uint64_t icrval; uint32_t dest, vec, mode; struct vlapic *vlapic2; struct vm_exit *vmexit; - + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + lapic->icr_lo &= ~APIC_DELSTAT_PEND; + icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; + if (x2apic(vlapic)) dest = icrval >> 32; else @@ -931,9 +902,12 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu) if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } - + + VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { switch 
(icrval & APIC_DEST_MASK) { case APIC_DEST_DESTFLD: @@ -963,8 +937,13 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu) lapic_intr_edge(vlapic->vm, i, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, i, 1); - } else + VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " + "to vcpuid %d", vec, i); + } else { vm_inject_nmi(vlapic->vm, i); + VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " + "to vcpuid %d", i); + } } return (0); /* handled completely in the kernel */ @@ -1019,12 +998,15 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu) } int -vlapic_pending_intr(struct vlapic *vlapic) +vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; + if (vlapic->ops.pending_intr) + return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); + irrptr = &lapic->irr0; /* @@ -1039,21 +1021,26 @@ vlapic_pending_intr(struct vlapic *vlapic) vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); - return (vector); + if (vecptr != NULL) + *vecptr = vector; + return (1); } else break; } } - return (-1); + return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; + if (vlapic->ops.intr_accepted) + return ((*vlapic->ops.intr_accepted)(vlapic, vector)); + /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. @@ -1081,24 +1068,30 @@ vlapic_intr_accepted(struct vlapic *vlapic, int vector) vlapic_update_ppr(vlapic); } -static void -lapic_set_svr(struct vlapic *vlapic, uint32_t new) +void +vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; - uint32_t old, changed; + uint32_t old, new, changed; + + lapic = vlapic->apic_page; + + new = lapic->svr; + old = vlapic->svr_last; + vlapic->svr_last = new; - lapic = &vlapic->apic; - old = lapic->svr; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* - * The apic is now disabled so stop the apic timer. + * The apic is now disabled so stop the apic timer + * and mask all the LVT entries. 
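
(The PRIO() test in vlapic_pending_intr() above is easiest to see with numbers; the helper and the values in the comment are illustrative only.)

#include <stdint.h>

#define PRIO(x)	((x) >> 4)	/* 16 priority classes, 16 vectors each */

static int
deliverable_sketch(int vector, uint8_t ppr)
{
	/*
	 * Vector 0x45 is in class 4: deliverable against PPR 0x3f
	 * (class 3) but blocked against PPR 0x4f (class 4), because
	 * delivery requires a strictly higher class than the PPR.
	 */
	return (PRIO(vector) > PRIO(ppr));
}
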
*/ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); + vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer @@ -1106,16 +1099,15 @@ lapic_set_svr(struct vlapic *vlapic, uint32_t new) */ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) - vlapic_set_icr_timer(vlapic, lapic->icr_timer); + vlapic_icrtmr_write_handler(vlapic); } } - lapic->svr = new; } int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; @@ -1128,7 +1120,7 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) switch(offset) { case APIC_OFFSET_ID: - *data = vlapic_get_id(vlapic); + *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; @@ -1146,10 +1138,10 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) *data = lapic->eoi; break; case APIC_OFFSET_LDR: - *data = vlapic_get_ldr(vlapic); + *data = lapic->ldr; break; case APIC_OFFSET_DFR: - *data = vlapic_get_dfr(vlapic); + *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; @@ -1174,6 +1166,8 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; + if (x2apic(vlapic)) + *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; @@ -1181,14 +1175,19 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu) case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); +#ifdef INVARIANTS + reg = vlapic_get_lvtptr(vlapic, offset); + KASSERT(*data == *reg, ("inconsistent lvt value at " + "offset %#lx: %#lx/%#x", offset, *data, *reg)); +#endif break; - case APIC_OFFSET_ICR: + case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; - case APIC_OFFSET_CCR: + case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; - case APIC_OFFSET_DCR: + case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_RRR: @@ -1204,9 +1203,13 @@ done: int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *regptr; int retval; + KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, + ("vlapic_write: invalid offset %#lx", offset)); + VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data); if (offset > sizeof(*lapic)) { @@ -1214,10 +1217,11 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) } retval = 0; - offset &= ~3; switch(offset) { case APIC_OFFSET_ID: + lapic->id = data; + vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: lapic->tpr = data & 0xff; @@ -1227,41 +1231,44 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: - vlapic_set_ldr(vlapic, data); + lapic->ldr = data; + vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: - vlapic_set_dfr(vlapic, data); + lapic->dfr = data; + vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: - lapic_set_svr(vlapic, data); + lapic->svr = data; + vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: - if (!x2apic(vlapic)) { - data &= 0xffffffff; - 
data |= (uint64_t)lapic->icr_hi << 32; - } - retval = lapic_process_icr(vlapic, data, retu); + lapic->icr_lo = data; + if (x2apic(vlapic)) + lapic->icr_hi = data >> 32; + retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: - if (!x2apic(vlapic)) { - retval = 0; - lapic->icr_hi = data; - } + lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: - vlapic_set_lvt(vlapic, offset, data); + regptr = vlapic_get_lvtptr(vlapic, offset); + *regptr = data; + vlapic_lvt_write_handler(vlapic, offset); break; - case APIC_OFFSET_ICR: - vlapic_set_icr_timer(vlapic, data); + case APIC_OFFSET_TIMER_ICR: + lapic->icr_timer = data; + vlapic_icrtmr_write_handler(vlapic); break; - case APIC_OFFSET_DCR: - vlapic_set_dcr(vlapic, data); + case APIC_OFFSET_TIMER_DCR: + lapic->dcr_timer = data; + vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: - vlapic_update_errors(vlapic); + vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: @@ -1270,7 +1277,7 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: - case APIC_OFFSET_CCR: + case APIC_OFFSET_TIMER_CCR: default: // Read only. break; @@ -1279,14 +1286,41 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu) return (retval); } -struct vlapic * -vlapic_init(struct vm *vm, int vcpuid) +static void +vlapic_reset(struct vlapic *vlapic) { - struct vlapic *vlapic; + struct LAPIC *lapic; + + lapic = vlapic->apic_page; + bzero(lapic, sizeof(struct LAPIC)); - vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO); - vlapic->vm = vm; - vlapic->vcpuid = vcpuid; + lapic->id = vlapic_get_id(vlapic); + lapic->version = VLAPIC_VERSION; + lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; + vlapic_mask_lvts(vlapic); + vlapic_reset_tmr(vlapic); + + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + if (vlapic->vcpuid == 0) + vlapic->boot_state = BS_RUNNING; /* BSP */ + else + vlapic->boot_state = BS_INIT; /* AP */ + + vlapic->svr_last = lapic->svr; +} + +void +vlapic_init(struct vlapic *vlapic) +{ + KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); + KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + ("vlapic_init: vcpuid is not initialized")); + KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " + "initialized")); /* * If the vlapic is configured in x2apic mode then it will be @@ -1300,12 +1334,10 @@ vlapic_init(struct vm *vm, int vcpuid) vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; - if (vcpuid == 0) + if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic_reset(vlapic); - - return (vlapic); } void @@ -1313,7 +1345,6 @@ vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); - free(vlapic, M_VLAPIC); } uint64_t @@ -1324,19 +1355,38 @@ vlapic_get_apicbase(struct vlapic *vlapic) } void -vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val) +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { - int err; + struct LAPIC *lapic; enum x2apic_state state; + uint64_t old; + int err; err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state); if (err) panic("vlapic_set_apicbase: err %d fetching x2apic state", err); if (state == X2APIC_DISABLED) - val &= ~APICBASE_X2APIC; + new &= 
~APICBASE_X2APIC; + + old = vlapic->msr_apicbase; + vlapic->msr_apicbase = new; - vlapic->msr_apicbase = val; + /* + * If the vlapic is switching between xAPIC and x2APIC modes then + * reset the mode-dependent registers. + */ + if ((old ^ new) & APICBASE_X2APIC) { + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + } } void @@ -1378,10 +1428,28 @@ vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, } } +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. + */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + ipi_cpu(hostcpu, ipinum); +} + bool vlapic_enabled(struct vlapic *vlapic) { - struct LAPIC *lapic = &vlapic->apic; + struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) @@ -1389,3 +1457,62 @@ vlapic_enabled(struct vlapic *vlapic) else return (false); } + +static void +vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct LAPIC *lapic; + uint32_t *tmrptr, mask; + int idx; + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + idx = (vector / 32) * 4; + mask = 1 << (vector % 32); + if (level) + tmrptr[idx] |= mask; + else + tmrptr[idx] &= ~mask; + + if (vlapic->ops.set_tmr != NULL) + (*vlapic->ops.set_tmr)(vlapic, vector, level); +} + +void +vlapic_reset_tmr(struct vlapic *vlapic) +{ + int vector; + + VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + + for (vector = 0; vector <= 255; vector++) + vlapic_set_tmr(vlapic, vector, false); +} + +void +vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector) +{ + cpuset_t dmask; + bool lowprio; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + + /* + * A level trigger is valid only for fixed and lowprio delivery modes. 
+ */ + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " + "delivery-mode %d", delmode); + return; + } + + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); + + if (!CPU_ISSET(vlapic->vcpuid, &dmask)) + return; + + VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); + vlapic_set_tmr(vlapic, vector, true); +} diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index 98f377e..d2fc6d9 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -30,74 +30,45 @@ #define _VLAPIC_H_ struct vm; - -/* - * Map of APIC Registers: Offset Description Access - */ -#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W -#define APIC_OFFSET_VER 0x30 // Local APIC Version R -#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W -#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R -#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R -#define APIC_OFFSET_EOI 0xB0 // EOI Register W -#define APIC_OFFSET_RRR 0xC0 // Remote read R -#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W -#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W -#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W -#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R -#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R -#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R -#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R -#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R -#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R -#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R -#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R -#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R -#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R -#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R -#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R -#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R -#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R -#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R -#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R -#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R -#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R -#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R -#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R -#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R -#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R -#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R -#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R -#define APIC_OFFSET_ESR 0x280 // Error Status Register R -#define APIC_OFFSET_CMCI_LVT 0x2F0 // Local Vector Table (CMCI) R/W -#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W -#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W -#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W -#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+) -#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+) -#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W -#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W -#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W -#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W -#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R -#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W - -/* - * 16 priority levels with at most one vector injected per level. 
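
(The removed sizing comment above reappears in vlapic_priv.h later in this diff. Condensed, the stack discipline it describes looks like the sketch below; the field names mirror the real ones, but the helpers are hypothetical and bounds checks are omitted.)

#include <stdint.h>

static uint8_t isrvec_stk[16 + 1];	/* 16 classes + slot 0 sentinel */
static int isrvec_stk_top;		/* slot 0 always holds vector 0 */

static void
isrvec_push_sketch(int vector)		/* on the IRR -> ISR transition */
{
	isrvec_stk[++isrvec_stk_top] = vector;
}

static int
isrvec_pop_sketch(void)			/* on EOI */
{
	return (isrvec_stk[isrvec_stk_top--]);
}
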
- */ -#define ISRVEC_STK_SIZE (16 + 1) - enum x2apic_state; -struct vlapic *vlapic_init(struct vm *vm, int vcpuid); -void vlapic_cleanup(struct vlapic *vlapic); int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu); int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu); -int vlapic_pending_intr(struct vlapic *vlapic); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. + */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); -void vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. + */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); @@ -109,4 +80,26 @@ bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); + +/* Reset the trigger-mode bits for all vectors to be edge-triggered */ +void vlapic_reset_tmr(struct vlapic *vlapic); + +/* + * Set the trigger-mode bit associated with 'vector' to level-triggered if + * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to + * this 'vlapic'. + */ +void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/io/vlapic_priv.h b/sys/amd64/vmm/io/vlapic_priv.h new file mode 100644 index 0000000..a4e96aa --- /dev/null +++ b/sys/amd64/vmm/io/vlapic_priv.h @@ -0,0 +1,185 @@ +/*- + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include <x86/apicreg.h> + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ + +#define 
VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); +}; + +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + int esr_firing; + + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. + * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + + uint64_t msr_apicbase; + enum boot_state boot_state; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. 
lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index f471218b..2c86068 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -124,19 +124,25 @@ struct vm { * An active vcpu is one that has been started implicitly (BSP) or * explicitly (AP) by sending it a startup ipi. */ - cpuset_t active_cpus; + volatile cpuset_t active_cpus; + + struct mtx rendezvous_mtx; + cpuset_t rendezvous_req_cpus; + cpuset_t rendezvous_done_cpus; + void *rendezvous_arg; + vm_rendezvous_func_t rendezvous_func; }; static int vmm_initialized; static struct vmm_ops *ops; -#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) -#define VMRUN(vmi, vcpu, rip, pmap) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO) +#define VMRUN(vmi, vcpu, rip, pmap, rptr) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) #define VMSPACE_ALLOC(min, max) \ (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) @@ -156,6 +162,10 @@ static struct vmm_ops *ops; (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) #define VMSETCAP(vmi, vcpu, num, val) \ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) +#define VLAPIC_INIT(vmi, vcpu) \ + (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) +#define VLAPIC_CLEANUP(vmi, vlapic) \ + (ops != NULL ? 
(*ops->vlapic_cleanup)(vmi, vlapic) : NULL) #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() @@ -166,10 +176,20 @@ CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */ /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static void vm_deactivate_cpu(struct vm *vm, int vcpuid); + static void -vcpu_cleanup(struct vcpu *vcpu) +vcpu_cleanup(struct vm *vm, int i) { - vlapic_cleanup(vcpu->vlapic); + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); } @@ -184,7 +204,7 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id) vcpu_lock_init(vcpu); vcpu->hostcpu = NOCPU; vcpu->vcpuid = vcpu_id; - vcpu->vlapic = vlapic_init(vm, vcpu_id); + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED); vcpu->guestfpu = fpu_save_area_alloc(); fpu_save_area_reset(vcpu->guestfpu); @@ -216,7 +236,10 @@ vmm_init(void) int error; vmm_host_state_init(); - vmm_ipi_init(); + + vmm_ipinum = vmm_ipi_alloc(); + if (vmm_ipinum == 0) + vmm_ipinum = IPI_AST; error = vmm_mem_init(); if (error) @@ -232,7 +255,7 @@ vmm_init(void) vmm_msr_init(); vmm_resume_p = vmm_resume; - return (VMM_INIT()); + return (VMM_INIT(vmm_ipinum)); } static int @@ -253,7 +276,8 @@ vmm_handler(module_t mod, int what, void *arg) if (error == 0) { vmm_resume_p = NULL; iommu_cleanup(); - vmm_ipi_cleanup(); + if (vmm_ipinum != IPI_AST) + vmm_ipi_free(vmm_ipinum); error = VMM_CLEANUP(); /* * Something bad happened - prevent new @@ -288,8 +312,6 @@ static moduledata_t vmm_kmod = { DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); -SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); - int vm_create(const char *name, struct vm **retvm) { @@ -315,6 +337,8 @@ vm_create(const char *name, struct vm **retvm) vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); + vm->vmspace = vmspace; + mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); vm->cookie = VMINIT(vm, vmspace_pmap(vmspace)); vm->vioapic = vioapic_init(vm); vm->vhpet = vhpet_init(vm); @@ -325,7 +349,6 @@ vm_create(const char *name, struct vm **retvm) } vm_activate_cpu(vm, BSP); - vm->vmspace = vmspace; *retvm = vm; return (0); @@ -360,7 +383,7 @@ vm_destroy(struct vm *vm) vm->num_mem_segs = 0; for (i = 0; i < VM_MAXCPU; i++) - vcpu_cleanup(&vm->vcpu[i]); + vcpu_cleanup(vm, i); VMSPACE_FREE(vm->vmspace); @@ -866,6 +889,63 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) panic("Error %d setting state to %d", error, newstate); } +static void +vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) +{ + + KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); + + /* + * Update 'rendezvous_func' and execute a write memory barrier to + * ensure that it is visible across all host cpus. This is not needed + * for correctness but it does ensure that all the vcpus will notice + * that the rendezvous is requested immediately. 
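
(As vm_handle_rendezvous() below shows, a rendezvous callback has the vm_rendezvous_func_t shape and is invoked once on every targeted vcpu; vioapic_update_tmr() at the top of this diff is the real user. The skeleton below is illustrative.)

struct vm;

static void
rendezvous_cb_sketch(struct vm *vm, int vcpuid, void *arg)
{
	/*
	 * Runs while the remaining vcpus in 'rendezvous_req_cpus'
	 * are stalled waiting for completion, so it must never sleep.
	 */
}
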
+ */ + vm->rendezvous_func = func; + wmb(); +} + +#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ + do { \ + if (vcpuid >= 0) \ + VCPU_CTR0(vm, vcpuid, fmt); \ + else \ + VM_CTR0(vm, fmt); \ + } while (0) + +static void +vm_handle_rendezvous(struct vm *vm, int vcpuid) +{ + + KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), + ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); + + mtx_lock(&vm->rendezvous_mtx); + while (vm->rendezvous_func != NULL) { + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); + + if (vcpuid != -1 && + CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && + !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { + VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); + (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); + CPU_SET(vcpuid, &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); + vm_set_rendezvous_func(vm, NULL); + wakeup(&vm->rendezvous_func); + break; + } + RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); + mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, + "vmrndv", 0); + } + mtx_unlock(&vm->rendezvous_mtx); +} + /* * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. */ @@ -874,9 +954,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { struct vm_exit *vmexit; struct vcpu *vcpu; - int t, timo; + int t, timo, spindown; vcpu = &vm->vcpu[vcpuid]; + spindown = 0; vcpu_lock(vcpu); @@ -888,7 +969,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) * returned from VMRUN() and before we grabbed the vcpu lock. */ if (!vm_nmi_pending(vm, vcpuid) && - (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) { + (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { t = ticks; vcpu_require_state_locked(vcpu, VCPU_SLEEPING); if (vlapic_enabled(vcpu->vlapic)) { @@ -903,16 +984,25 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) * Spindown the vcpu if the apic is disabled and it * had entered the halted state. */ - *retu = true; - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; - VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + spindown = 1; } vcpu_require_state_locked(vcpu, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } vcpu_unlock(vcpu); + /* + * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it + * outside the confines of the vcpu spinlock. 
+ */ + if (spindown) { + *retu = true; + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; + vm_deactivate_cpu(vm, vcpuid); + VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + } + return (0); } @@ -1042,7 +1132,7 @@ restart: vcpu_require_state(vm, vcpuid, VCPU_RUNNING); vcpu->hostcpu = curcpu; - error = VMRUN(vm->cookie, vcpuid, rip, pmap); + error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func); vcpu->hostcpu = NOCPU; vcpu_require_state(vm, vcpuid, VCPU_FROZEN); @@ -1056,6 +1146,14 @@ restart: if (error == 0) { retu = false; switch (vme->exitcode) { + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RENDEZVOUS: + vm_handle_rendezvous(vm, vcpuid); + error = 0; + break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); @@ -1111,7 +1209,7 @@ vm_inject_nmi(struct vm *vm, int vcpuid) vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; - vcpu_notify_event(vm, vcpuid); + vcpu_notify_event(vm, vcpuid, false); return (0); } @@ -1286,8 +1384,37 @@ void vm_activate_cpu(struct vm *vm, int vcpuid) { - if (vcpuid >= 0 && vcpuid < VM_MAXCPU) - CPU_SET(vcpuid, &vm->active_cpus); + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, + ("vm_activate_cpu: invalid vcpuid %d", vcpuid)); + KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus), + ("vm_activate_cpu: vcpuid %d is already active", vcpuid)); + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); +} + +static void +vm_deactivate_cpu(struct vm *vm, int vcpuid) +{ + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, + ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid)); + KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus), + ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid)); + + VCPU_CTR0(vm, vcpuid, "deactivated"); + CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus); + + /* + * If a vcpu rendezvous is in progress then it could be blocked + * on 'vcpuid' - unblock it before disappearing forever. + */ + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) { + VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation"); + wakeup(&vm->rendezvous_func); + } + mtx_unlock(&vm->rendezvous_mtx); } cpuset_t @@ -1339,7 +1466,7 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) * to the host_cpu to cause the vcpu to trap into the hypervisor. 
*/ void -vcpu_notify_event(struct vm *vm, int vcpuid) +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) { int hostcpu; struct vcpu *vcpu; @@ -1354,8 +1481,13 @@ vcpu_notify_event(struct vm *vm, int vcpuid) } else { if (vcpu->state != VCPU_RUNNING) panic("invalid vcpu state %d", vcpu->state); - if (hostcpu != curcpu) - ipi_cpu(hostcpu, vmm_ipinum); + if (hostcpu != curcpu) { + if (lapic_intr) + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + else + ipi_cpu(hostcpu, vmm_ipinum); + } } vcpu_unlock(vcpu); } @@ -1375,3 +1507,51 @@ vm_apicid2vcpuid(struct vm *vm, int apicid) */ return (apicid); } + +void +vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg) +{ + int i; + + /* + * Enforce that this function is called without any locks + */ + WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); + KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), + ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); + +restart: + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) { + /* + * If a rendezvous is already in progress then we need to + * call the rendezvous handler in case this 'vcpuid' is one + * of the targets of the rendezvous. + */ + RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); + mtx_unlock(&vm->rendezvous_mtx); + vm_handle_rendezvous(vm, vcpuid); + goto restart; + } + KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " + "rendezvous is still in progress")); + + RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); + vm->rendezvous_req_cpus = dest; + CPU_ZERO(&vm->rendezvous_done_cpus); + vm->rendezvous_arg = arg; + vm_set_rendezvous_func(vm, func); + mtx_unlock(&vm->rendezvous_mtx); + + /* + * Wake up any sleeping vcpus and trigger a VM-exit in any running + * vcpus so they handle the rendezvous as soon as possible. + */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(i, &dest)) + vcpu_notify_event(vm, i, false); + } + + vm_handle_rendezvous(vm, vcpuid); +} diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c index 643d326..1765284 100644 --- a/sys/amd64/vmm/vmm_ipi.c +++ b/sys/amd64/vmm/vmm_ipi.c @@ -44,15 +44,10 @@ __FBSDID("$FreeBSD$"); extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn); -/* - * The default is to use the IPI_AST to interrupt a vcpu. 
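
(A usage sketch of vm_smp_rendezvous(), condensed from the vioapic call site at the top of this diff: no locks may be held, and 'vcpuid' is the calling vcpu or -1 outside vcpu context.)

	cpuset_t allvcpus;

	allvcpus = vm_active_cpus(vm);
	vm_smp_rendezvous(vm, vcpuid, allvcpus, vioapic_update_tmr, NULL);
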
- */ -int vmm_ipinum = IPI_AST; - CTASSERT(APIC_SPURIOUS_INT == 255); -void -vmm_ipi_init(void) +int +vmm_ipi_alloc(void) { int idx; uintptr_t func; @@ -72,22 +67,27 @@ vmm_ipi_init(void) ip = &idt[idx]; func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func == (uintptr_t)&IDTVEC(rsvd)) { - vmm_ipinum = idx; - setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT, + setidt(idx , IDTVEC(justreturn), SDT_SYSIGT, SEL_KPL, 0); - break; + return (idx); } } - - if (vmm_ipinum != IPI_AST && bootverbose) { - printf("vmm_ipi_init: installing ipi handler to interrupt " - "vcpus at vector %d\n", vmm_ipinum); - } + return (0); } void -vmm_ipi_cleanup(void) +vmm_ipi_free(int ipinum) { - if (vmm_ipinum != IPI_AST) - setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); + uintptr_t func; + struct gate_descriptor *ip; + + KASSERT(ipinum >= APIC_IPI_INTS && ipinum < APIC_SPURIOUS_INT, + ("invalid ipi %d", ipinum)); + + ip = &idt[ipinum]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + KASSERT(func == (uintptr_t)&IDTVEC(justreturn), + ("invalid ipi %d", ipinum)); + + setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); } diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h index 91552e3..679d183 100644 --- a/sys/amd64/vmm/vmm_ipi.h +++ b/sys/amd64/vmm/vmm_ipi.h @@ -29,11 +29,7 @@ #ifndef _VMM_IPI_H_ #define _VMM_IPI_H_ -struct vm; - -extern int vmm_ipinum; - -void vmm_ipi_init(void); -void vmm_ipi_cleanup(void); +int vmm_ipi_alloc(void); +void vmm_ipi_free(int num); #endif diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c index 8d915cd..47e04da 100644 --- a/sys/amd64/vmm/vmm_lapic.c +++ b/sys/amd64/vmm/vmm_lapic.c @@ -51,26 +51,6 @@ __FBSDID("$FreeBSD$"); #define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ int -lapic_pending_intr(struct vm *vm, int cpu) -{ - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - - return (vlapic_pending_intr(vlapic)); -} - -void -lapic_intr_accepted(struct vm *vm, int cpu, int vector) -{ - struct vlapic *vlapic; - - vlapic = vm_lapic(vm, cpu); - - vlapic_intr_accepted(vlapic, vector); -} - -int lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; @@ -82,10 +62,8 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) return (EINVAL); vlapic = vm_lapic(vm, cpu); - vlapic_set_intr_ready(vlapic, vector, level); - - vcpu_notify_event(vm, cpu); - + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vm, cpu, true); return (0); } diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h index c5c95aa..88fa948 100644 --- a/sys/amd64/vmm/vmm_lapic.h +++ b/sys/amd64/vmm/vmm_lapic.h @@ -43,26 +43,6 @@ int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, void *arg); /* - * Returns a vector between 32 and 255 if an interrupt is pending in the - * IRR that can be delivered based on the current state of ISR and TPR. - * - * Note that the vector does not automatically transition to the ISR as a - * result of calling this function. - * - * Returns -1 if there is no eligible vector that can be delivered to the - * guest at this time. - */ -int lapic_pending_intr(struct vm *vm, int cpu); - -/* - * Transition 'vector' from IRR to ISR. This function is called with the - * vector returned by 'lapic_pending_intr()' when the guest is able to - * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that - * block interrupt delivery). 
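
(The alloc/free pair above replaces the old fixed IPI_AST scheme. Its lifecycle, as wired into vmm_init() and vmm_handler() earlier in this diff, reduces to the following.)

	vmm_ipinum = vmm_ipi_alloc();	/* claim an unused IDT vector */
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;	/* none free: fall back to IPI_AST */

	/* ... and on module unload ... */
	if (vmm_ipinum != IPI_AST)
		vmm_ipi_free(vmm_ipinum);
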
- */ -void lapic_intr_accepted(struct vm *vm, int cpu, int vector); - -/* * Signals to the LAPIC that an interrupt at 'vector' needs to be generated * to the 'cpu', the state is recorded in IRR. */ diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c index 781fda5..0951e1e 100644 --- a/sys/amd64/vmm/vmm_stat.c +++ b/sys/amd64/vmm/vmm_stat.c @@ -146,7 +146,9 @@ VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); -VMM_STAT(VMEXIT_EPT_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h index bc58113..0190a63 100644 --- a/sys/amd64/vmm/vmm_stat.h +++ b/sys/amd64/vmm/vmm_stat.h @@ -116,8 +116,10 @@ VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); VMM_STAT_DECLARE(VMEXIT_INOUT); VMM_STAT_DECLARE(VMEXIT_CPUID); -VMM_STAT_DECLARE(VMEXIT_EPT_FAULT); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); #endif
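
(To close, the split between vlapic_pending_intr() and vlapic_intr_accepted() documented in vlapic.h above is consumed roughly as sketched here; the guard function is hypothetical and stands in for the RFLAGS.IF and interrupt-shadow checks done by the real backends.)

	int vector;

	if (vlapic_pending_intr(vlapic, &vector)) {
		if (guest_can_take_interrupt_sketch()) {
			/* inject 'vector', then move it from IRR to ISR */
			vlapic_intr_accepted(vlapic, vector);
		}
	}
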