path: root/sys/amd64
author	jhb <jhb@FreeBSD.org>	2014-05-17 19:11:08 +0000
committer	jhb <jhb@FreeBSD.org>	2014-05-17 19:11:08 +0000
commit	bbf655f9b49cc39db4559ede5c58d302ff8f3de2 (patch)
tree	f6cf26193250fdea84a6946390d9759716c70b5c /sys/amd64
parent	7e7928763170f8b10771c099cf46224daaf67bca (diff)
MFC 259641,259863,259924,259937,259961,259978,260380,260383,260410,260466,
260531,260532,260550,260619,261170,261453,261621,263280,263290,264516:

Add support for local APIC hardware-assist.

- Restructure vlapic access and register handling to support hardware-assist
  for the local APIC.
- Use the 'Virtual Interrupt Delivery' and 'Posted Interrupt Processing'
  features of Intel VT-x if supported by hardware.
- Add an API to rendezvous all active vcpus in a virtual machine and use it
  to support level-triggered interrupts with VT-x 'Virtual Interrupt
  Delivery'.
- Use a cheaper IPI handler than IPI_AST for nested page table shootdowns
  and avoid doing unnecessary nested TLB invalidations.

Reviewed by:	neel
Diffstat (limited to 'sys/amd64')
-rw-r--r--	sys/amd64/amd64/pmap.c			7
-rw-r--r--	sys/amd64/include/pmap.h		7
-rw-r--r--	sys/amd64/include/vmm.h			40
-rw-r--r--	sys/amd64/vmm/amd/amdv.c		20
-rw-r--r--	sys/amd64/vmm/intel/ept.c		5
-rw-r--r--	sys/amd64/vmm/intel/ept.h		2
-rw-r--r--	sys/amd64/vmm/intel/vmcs.c		44
-rw-r--r--	sys/amd64/vmm/intel/vmcs.h		41
-rw-r--r--	sys/amd64/vmm/intel/vmx.c		698
-rw-r--r--	sys/amd64/vmm/intel/vmx.h		29
-rw-r--r--	sys/amd64/vmm/intel/vmx_controls.h	23
-rw-r--r--	sys/amd64/vmm/intel/vmx_genassym.c	9
-rw-r--r--	sys/amd64/vmm/intel/vmx_support.S	33
-rw-r--r--	sys/amd64/vmm/io/vioapic.c		87
-rw-r--r--	sys/amd64/vmm/io/vlapic.c		669
-rw-r--r--	sys/amd64/vmm/io/vlapic.h		115
-rw-r--r--	sys/amd64/vmm/io/vlapic_priv.h		185
-rw-r--r--	sys/amd64/vmm/vmm.c			234
-rw-r--r--	sys/amd64/vmm/vmm_ipi.c			36
-rw-r--r--	sys/amd64/vmm/vmm_ipi.h			8
-rw-r--r--	sys/amd64/vmm/vmm_lapic.c		26
-rw-r--r--	sys/amd64/vmm/vmm_lapic.h		20
-rw-r--r--	sys/amd64/vmm/vmm_stat.c		4
-rw-r--r--	sys/amd64/vmm/vmm_stat.h		4
24 files changed, 1753 insertions, 593 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 4aa66b5..2b61023 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -1304,6 +1304,7 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
+ int ipinum;
sched_pin();
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
@@ -1328,11 +1329,9 @@ pmap_invalidate_ept(pmap_t pmap)
/*
* Force the vcpu to exit and trap back into the hypervisor.
- *
- * XXX this is not optimal because IPI_AST builds a trapframe
- * whereas all we need is an 'eoi' followed by 'iret'.
*/
- ipi_selected(pmap->pm_active, IPI_AST);
+ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
+ ipi_selected(pmap->pm_active, ipinum);
sched_unpin();
}
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 1b5f6a0..e83e07e 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -312,9 +312,10 @@ struct pmap {
};
/* flags */
-#define PMAP_PDE_SUPERPAGE (1 << 0) /* supports 2MB superpages */
-#define PMAP_EMULATE_AD_BITS (1 << 1) /* needs A/D bits emulation */
-#define PMAP_SUPPORTS_EXEC_ONLY (1 << 2) /* execute only mappings ok */
+#define PMAP_NESTED_IPIMASK 0xff
+#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */
+#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */
+#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */
typedef struct pmap *pmap_t;
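[Annotation, not part of the patch] The repacking above reserves the low byte
of 'pm_flags' for the IPI vector used on nested page table shootdowns, moving
the feature bits up to make room. A minimal sketch of the encoding, assuming a
hypothetical vector value (the macros are the ones defined in this hunk;
ept_init() and pmap_invalidate_ept() are the real producer and consumer):

    #include <assert.h>

    #define	PMAP_NESTED_IPIMASK	0xff		/* low byte holds the IPI vector */
    #define	PMAP_PDE_SUPERPAGE	(1 << 8)	/* feature bits start above it */

    int
    main(void)
    {
    	int pm_flags = PMAP_PDE_SUPERPAGE;
    	int ipinum = 0xf5;	/* hypothetical vector from vmm_ipi_alloc() */

    	/* ept_init() stores the vector in the pmap flags ... */
    	pm_flags |= (ipinum & PMAP_NESTED_IPIMASK);

    	/* ... and pmap_invalidate_ept() recovers it for ipi_selected(). */
    	assert((pm_flags & PMAP_NESTED_IPIMASK) == ipinum);
    	return (0);
    }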
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 92b767f..fab7e74 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -47,12 +47,12 @@ struct pmap;
enum x2apic_state;
-typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_init_func_t)(int ipinum);
typedef int (*vmm_cleanup_func_t)(void);
typedef void (*vmm_resume_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
- struct pmap *pmap);
+ struct pmap *pmap, void *rendezvous_cookie);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
@@ -69,6 +69,8 @@ typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
+typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
+typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
@@ -87,6 +89,8 @@ struct vmm_ops {
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
+ vmi_vlapic_init vlapic_init;
+ vmi_vlapic_cleanup vlapic_cleanup;
};
extern struct vmm_ops vmm_ops_intel;
@@ -132,6 +136,31 @@ cpuset_t vm_active_cpus(struct vm *vm);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
/*
+ * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
+ * The rendezvous 'func(arg)' is not allowed to do anything that will
+ * cause the thread to be put to sleep.
+ *
+ * If the rendezvous is being initiated from a vcpu context then the
+ * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
+ *
+ * The caller cannot hold any locks when initiating the rendezvous.
+ *
+ * The implementation of this API may cause vcpus other than those specified
+ * by 'dest' to be stalled. The caller should not rely on any vcpus making
+ * forward progress when the rendezvous is in progress.
+ */
+typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
+void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
+ vm_rendezvous_func_t func, void *arg);
+
+static __inline int
+vcpu_rendezvous_pending(void *rendezvous_cookie)
+{
+
+ return (*(uintptr_t *)rendezvous_cookie != 0);
+}
+
+/*
* Return 1 if device indicated by bus/slot/func is supposed to be a
* pci passthrough device.
*
@@ -158,7 +187,7 @@ vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
}
void *vcpu_stats(struct vm *vm, int vcpu);
-void vcpu_notify_event(struct vm *vm, int vcpuid);
+void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
@@ -267,6 +296,8 @@ enum vm_exitcode {
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_SPINDOWN_CPU,
+ VM_EXITCODE_RENDEZVOUS,
+ VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_MAX
};
@@ -323,6 +354,9 @@ struct vm_exit {
struct {
uint64_t rflags;
} hlt;
+ struct {
+ int vector;
+ } ioapic_eoi;
} u;
};
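[Annotation, not part of the patch] The rendezvous contract documented above is
easiest to see from a caller's side. A minimal sketch with a made-up callback
and counter (neither exists in this commit); the callback must not sleep, and
'vcpuid' is -1 because the initiator here is not a vcpu:

    static volatile u_int example_count;

    static void
    example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
    {
    	/* Runs on each vcpu in 'dest'; sleeping is not allowed here. */
    	atomic_add_int(&example_count, 1);
    }

    static void
    example_rendezvous(struct vm *vm)
    {
    	/* No locks held; vcpuid is -1 since we are not a vcpu context. */
    	vm_smp_rendezvous(vm, -1, vm_active_cpus(vm),
    	    example_rendezvous_cb, NULL);
    }

vioapic_write() later in this patch uses exactly this pattern to rebuild every
vcpu's trigger-mode register after a redirection table change.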
diff --git a/sys/amd64/vmm/amd/amdv.c b/sys/amd64/vmm/amd/amdv.c
index 6c87901..00484c7 100644
--- a/sys/amd64/vmm/amd/amdv.c
+++ b/sys/amd64/vmm/amd/amdv.c
@@ -38,7 +38,7 @@ __FBSDID("$FreeBSD$");
#include "io/iommu.h"
static int
-amdv_init(void)
+amdv_init(int ipinum)
{
printf("amdv_init: not implemented\n");
@@ -67,7 +67,7 @@ amdv_vminit(struct vm *vm, struct pmap *pmap)
}
static int
-amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap)
+amdv_vmrun(void *arg, int vcpu, register_t rip, struct pmap *pmap, void *cookie)
{
printf("amdv_vmrun: not implemented\n");
@@ -155,6 +155,20 @@ amdv_vmspace_free(struct vmspace *vmspace)
return;
}
+static struct vlapic *
+amdv_vlapic_init(void *arg, int vcpuid)
+{
+
+ panic("amdv_vlapic_init: not implmented");
+}
+
+static void
+amdv_vlapic_cleanup(void *arg, struct vlapic *vlapic)
+{
+
+ panic("amdv_vlapic_cleanup: not implemented");
+}
+
struct vmm_ops vmm_ops_amd = {
amdv_init,
amdv_cleanup,
@@ -171,6 +185,8 @@ struct vmm_ops vmm_ops_amd = {
amdv_setcap,
amdv_vmspace_alloc,
amdv_vmspace_free,
+ amdv_vlapic_init,
+ amdv_vlapic_cleanup,
};
static int
diff --git a/sys/amd64/vmm/intel/ept.c b/sys/amd64/vmm/intel/ept.c
index 18e90f3..5f6c4d0 100644
--- a/sys/amd64/vmm/intel/ept.c
+++ b/sys/amd64/vmm/intel/ept.c
@@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$");
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
+#include "vmm_ipi.h"
#include "vmx_msr.h"
#include "ept.h"
@@ -76,7 +77,7 @@ SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
&ept_pmap_flags, 0, NULL);
int
-ept_init(void)
+ept_init(int ipinum)
{
int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
@@ -98,6 +99,8 @@ ept_init(void)
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
+ ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK;
+
use_superpages = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
if (use_superpages && EPT_PDE_SUPERPAGE(cap))
diff --git a/sys/amd64/vmm/intel/ept.h b/sys/amd64/vmm/intel/ept.h
index dfd3a44..1393e46 100644
--- a/sys/amd64/vmm/intel/ept.h
+++ b/sys/amd64/vmm/intel/ept.h
@@ -31,7 +31,7 @@
struct vmx;
-int ept_init(void);
+int ept_init(int ipinum);
void ept_invalidate_mappings(u_long eptp);
struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
void ept_vmspace_free(struct vmspace *vmspace);
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index 980eac1..1ddefe0 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -315,11 +315,7 @@ done:
}
int
-vmcs_set_defaults(struct vmcs *vmcs,
- u_long host_rip, u_long host_rsp, uint64_t eptp,
- uint32_t pinbased_ctls, uint32_t procbased_ctls,
- uint32_t procbased_ctls2, uint32_t exit_ctls,
- uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+vmcs_init(struct vmcs *vmcs)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
@@ -335,22 +331,6 @@ vmcs_set_defaults(struct vmcs *vmcs,
*/
VMPTRLD(vmcs);
- /*
- * Load the VMX controls
- */
- if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
- goto done;
- if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
- goto done;
- if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
- goto done;
- if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
- goto done;
- if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
- goto done;
-
- /* Guest state */
-
/* Initialize guest IA32_PAT MSR with the default value */
pat = PAT_VALUE(0, PAT_WRITE_BACK) |
PAT_VALUE(1, PAT_WRITE_THROUGH) |
@@ -422,23 +402,7 @@ vmcs_set_defaults(struct vmcs *vmcs,
goto done;
/* instruction pointer */
- if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
- goto done;
-
- /* stack pointer */
- if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
- goto done;
-
- /* eptp */
- if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
- goto done;
-
- /* vpid */
- if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
- goto done;
-
- /* msr bitmap */
- if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0)
goto done;
/* exception bitmap */
@@ -509,7 +473,7 @@ DB_SHOW_COMMAND(vmcs, db_show_vmcs)
switch (exit & 0x8000ffff) {
case EXIT_REASON_EXCEPTION:
case EXIT_REASON_EXT_INTR:
- val = vmcs_read(VMCS_EXIT_INTERRUPTION_INFO);
+ val = vmcs_read(VMCS_EXIT_INTR_INFO);
db_printf("Interrupt Type: ");
switch (val >> 8 & 0x7) {
case 0:
@@ -531,7 +495,7 @@ DB_SHOW_COMMAND(vmcs, db_show_vmcs)
db_printf(" Vector: %lu", val & 0xff);
if (val & 0x800)
db_printf(" Error Code: %lx",
- vmcs_read(VMCS_EXIT_INTERRUPTION_ERROR));
+ vmcs_read(VMCS_EXIT_INTR_ERRCODE));
db_printf("\n");
break;
case EXIT_REASON_EPT_FAULT:
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index b1e2883..fa03826 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -46,12 +46,7 @@ struct msr_entry {
};
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
-int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
- uint64_t eptp,
- uint32_t pinbased_ctls, uint32_t procbased_ctls,
- uint32_t procbased_ctls2, uint32_t exit_ctls,
- uint32_t entry_ctls, u_long msr_bitmap,
- uint16_t vpid);
+int vmcs_init(struct vmcs *vmcs);
int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv);
int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val);
int vmcs_getdesc(struct vmcs *vmcs, int ident,
@@ -102,6 +97,7 @@ vmcs_write(uint32_t encoding, uint64_t val)
/* 16-bit control fields */
#define VMCS_VPID 0x00000000
+#define VMCS_PIR_VECTOR 0x00000002
/* 16-bit guest-state fields */
#define VMCS_GUEST_ES_SELECTOR 0x00000800
@@ -112,6 +108,7 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_GUEST_GS_SELECTOR 0x0000080A
#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+#define VMCS_GUEST_INTR_STATUS 0x00000810
/* 16-bit host-state fields */
#define VMCS_HOST_ES_SELECTOR 0x00000C00
@@ -133,7 +130,13 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_TSC_OFFSET 0x00002010
#define VMCS_VIRTUAL_APIC 0x00002012
#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_PIR_DESC 0x00002016
#define VMCS_EPTP 0x0000201A
+#define VMCS_EOI_EXIT0 0x0000201C
+#define VMCS_EOI_EXIT1 0x0000201E
+#define VMCS_EOI_EXIT2 0x00002020
+#define VMCS_EOI_EXIT3 0x00002022
+#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2)
/* 64-bit read-only fields */
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
@@ -177,8 +180,8 @@ vmcs_write(uint32_t encoding, uint64_t val)
/* 32-bit read-only data fields */
#define VMCS_INSTRUCTION_ERROR 0x00004400
#define VMCS_EXIT_REASON 0x00004402
-#define VMCS_EXIT_INTERRUPTION_INFO 0x00004404
-#define VMCS_EXIT_INTERRUPTION_ERROR 0x00004406
+#define VMCS_EXIT_INTR_INFO 0x00004404
+#define VMCS_EXIT_INTR_ERRCODE 0x00004406
#define VMCS_IDT_VECTORING_INFO 0x00004408
#define VMCS_IDT_VECTORING_ERROR 0x0000440A
#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
@@ -315,7 +318,8 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define EXIT_REASON_PAUSE 40
#define EXIT_REASON_MCE 41
#define EXIT_REASON_TPR 43
-#define EXIT_REASON_APIC 44
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_VIRTUALIZED_EOI 45
#define EXIT_REASON_GDTR_IDTR 46
#define EXIT_REASON_LDTR_TR 47
#define EXIT_REASON_EPT_FAULT 48
@@ -326,13 +330,15 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
/*
* VMCS interrupt information fields
*/
-#define VMCS_INTERRUPTION_INFO_VALID (1U << 31)
-#define VMCS_INTERRUPTION_INFO_HW_INTR (0 << 8)
-#define VMCS_INTERRUPTION_INFO_NMI (2 << 8)
+#define VMCS_INTR_INFO_VALID (1U << 31)
+#define VMCS_INTR_INFO_TYPE(info) (((info) >> 8) & 0x7)
+#define VMCS_INTR_INFO_HW_INTR (0 << 8)
+#define VMCS_INTR_INFO_NMI (2 << 8)
/*
* VMCS IDT-Vectoring information fields
@@ -365,4 +371,15 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
+/*
+ * Exit qualification for APIC-access VM exit
+ */
+#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF)
+#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF)
+
+/*
+ * Exit qualification for APIC-write VM exit
+ */
+#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF)
+
#endif
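[Annotation, not part of the patch] A worked example of the VMCS_EOI_EXIT()
macro added above: consecutive 64-bit VMCS field encodings differ by 2, so the
macro steps through the four EOI-exit bitmap fields in 64-vector chunks; the
bit within a field is (vector % 64), which is how vmx_set_tmr() later in this
patch uses it. Given the definitions above:

    assert(VMCS_EOI_EXIT(45)  == VMCS_EOI_EXIT0);	/* 45/64 == 0, bit 45 */
    assert(VMCS_EOI_EXIT(130) == VMCS_EOI_EXIT2);	/* 130/64 == 2, bit 2 */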
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index bcaed4e..b79d174 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -45,15 +45,18 @@ __FBSDID("$FreeBSD$");
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
+#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include "vmm_host.h"
-#include "vmm_lapic.h"
+#include "vmm_ipi.h"
#include "vmm_msr.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
+#include "vlapic.h"
+#include "vlapic_priv.h"
#include "vmx_msr.h"
#include "ept.h"
@@ -92,6 +95,7 @@ __FBSDID("$FreeBSD$");
#define VM_EXIT_CTLS_ONE_SETTING \
(VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \
+ VM_EXIT_ACKNOWLEDGE_INTERRUPT | \
VM_EXIT_SAVE_PAT | \
VM_EXIT_LOAD_PAT)
#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
@@ -112,7 +116,8 @@ __FBSDID("$FreeBSD$");
#define HANDLED 1
#define UNHANDLED 0
-MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
@@ -164,12 +169,33 @@ static int cap_pause_exit;
static int cap_unrestricted_guest;
static int cap_monitor_trap;
static int cap_invpcid;
-
+
+static int virtual_interrupt_delivery;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+ &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
+
+static int posted_interrupts;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+ &posted_interrupts, 0, "APICv posted interrupt support");
+
+static int pirvec;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
+ &pirvec, 0, "APICv posted interrupt vector");
+
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
&vpid_alloc_failed, 0, NULL);
+/*
+ * Use the last page below 4GB as the APIC access address. This address is
+ * occupied by the boot firmware so it is guaranteed that it will not conflict
+ * with a page in system memory.
+ */
+#define APIC_ACCESS_ADDRESS 0xFFFFF000
+
+static void vmx_inject_pir(struct vlapic *vlapic);
+
#ifdef KTR
static const char *
exit_reason_to_str(int reason)
@@ -259,8 +285,8 @@ exit_reason_to_str(int reason)
return "mce";
case EXIT_REASON_TPR:
return "tpr";
- case EXIT_REASON_APIC:
- return "apic";
+ case EXIT_REASON_APIC_ACCESS:
+ return "apic-access";
case EXIT_REASON_GDTR_IDTR:
return "gdtridtr";
case EXIT_REASON_LDTR_TR:
@@ -281,6 +307,8 @@ exit_reason_to_str(int reason)
return "wbinvd";
case EXIT_REASON_XSETBV:
return "xsetbv";
+ case EXIT_REASON_APIC_WRITE:
+ return "apic-write";
default:
snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
return (reasonbuf);
@@ -424,6 +452,9 @@ vmx_disable(void *arg __unused)
static int
vmx_cleanup(void)
{
+
+ if (pirvec != 0)
+ vmm_ipi_free(pirvec);
if (vpid_unr != NULL) {
delete_unrhdr(vpid_unr);
@@ -457,11 +488,11 @@ vmx_restore(void)
}
static int
-vmx_init(void)
+vmx_init(int ipinum)
{
- int error;
+ int error, use_tpr_shadow;
uint64_t fixed0, fixed1, feature_control;
- uint32_t tmp;
+ uint32_t tmp, procbased2_vid_bits;
/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
if (!(cpu_feature2 & CPUID2_VMX)) {
@@ -595,9 +626,58 @@ vmx_init(void)
MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
&tmp) == 0);
+ /*
+ * Check support for virtual interrupt delivery.
+ */
+ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
+ PROCBASED2_VIRTUALIZE_X2APIC_MODE |
+ PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
+ PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
+
+ use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
+ &tmp) == 0);
+
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ procbased2_vid_bits, 0, &tmp);
+ if (error == 0 && use_tpr_shadow) {
+ virtual_interrupt_delivery = 1;
+ TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
+ &virtual_interrupt_delivery);
+ }
+
+ if (virtual_interrupt_delivery) {
+ procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
+ procbased_ctls2 |= procbased2_vid_bits;
+ procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
+
+ /*
+ * Check for Posted Interrupts only if Virtual Interrupt
+ * Delivery is enabled.
+ */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
+ &tmp);
+ if (error == 0) {
+ pirvec = vmm_ipi_alloc();
+ if (pirvec == 0) {
+ if (bootverbose) {
+ printf("vmx_init: unable to allocate "
+ "posted interrupt vector\n");
+ }
+ } else {
+ posted_interrupts = 1;
+ TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
+ &posted_interrupts);
+ }
+ }
+ }
+
+ if (posted_interrupts)
+ pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
/* Initialize EPT */
- error = ept_init();
+ error = ept_init(ipinum);
if (error) {
printf("vmx_init: ept initialization failed (%d)\n", error);
return (error);
@@ -638,6 +718,31 @@ vmx_init(void)
return (0);
}
+static void
+vmx_trigger_hostintr(int vector)
+{
+ uintptr_t func;
+ struct gate_descriptor *gd;
+
+ gd = &idt[vector];
+
+ KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
+ "invalid vector %d", vector));
+ KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
+ vector));
+ KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
+ "has invalid type %d", vector, gd->gd_type));
+ KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
+ "has invalid dpl %d", vector, gd->gd_dpl));
+ KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
+ "for vector %d has invalid selector %d", vector, gd->gd_selector));
+ KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
+ "IST %d", vector, gd->gd_ist));
+
+ func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
+ vmx_call_isr(func);
+}
+
static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
@@ -676,6 +781,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
uint16_t vpid[VM_MAXCPU];
int i, error, guest_msr_count;
struct vmx *vmx;
+ struct vmcs *vmcs;
vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
if ((uintptr_t)vmx & PAGE_MASK) {
@@ -740,27 +846,52 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vpid_alloc(vpid, VM_MAXCPU);
+ if (virtual_interrupt_delivery) {
+ error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
+ APIC_ACCESS_ADDRESS);
+ /* XXX this should really return an error to the caller */
+ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
+ }
+
for (i = 0; i < VM_MAXCPU; i++) {
- vmx->vmcs[i].identifier = vmx_revision();
- error = vmclear(&vmx->vmcs[i]);
+ vmcs = &vmx->vmcs[i];
+ vmcs->identifier = vmx_revision();
+ error = vmclear(vmcs);
if (error != 0) {
panic("vmx_vminit: vmclear error %d on vcpu %d\n",
error, i);
}
- error = vmcs_set_defaults(&vmx->vmcs[i],
- (u_long)vmx_exit_guest,
- (u_long)&vmx->ctx[i],
- vmx->eptp,
- pinbased_ctls,
- procbased_ctls,
- procbased_ctls2,
- exit_ctls, entry_ctls,
- vtophys(vmx->msr_bitmap),
- vpid[i]);
+ error = vmcs_init(vmcs);
+ KASSERT(error == 0, ("vmcs_init error %d", error));
- if (error != 0)
- panic("vmx_vminit: vmcs_set_defaults error %d", error);
+ VMPTRLD(vmcs);
+ error = 0;
+ error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
+ error += vmwrite(VMCS_EPTP, vmx->eptp);
+ error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
+ error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
+ error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
+ error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
+ error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
+ error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
+ error += vmwrite(VMCS_VPID, vpid[i]);
+ if (virtual_interrupt_delivery) {
+ error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
+ error += vmwrite(VMCS_VIRTUAL_APIC,
+ vtophys(&vmx->apic_page[i]));
+ error += vmwrite(VMCS_EOI_EXIT0, 0);
+ error += vmwrite(VMCS_EOI_EXIT1, 0);
+ error += vmwrite(VMCS_EOI_EXIT2, 0);
+ error += vmwrite(VMCS_EOI_EXIT3, 0);
+ }
+ if (posted_interrupts) {
+ error += vmwrite(VMCS_PIR_VECTOR, pirvec);
+ error += vmwrite(VMCS_PIR_DESC,
+ vtophys(&vmx->pir_desc[i]));
+ }
+ VMCLEAR(vmcs);
+ KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
vmx->cap[i].set = 0;
vmx->cap[i].proc_ctls = procbased_ctls;
@@ -771,9 +902,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
- error = vmcs_set_msr_save(&vmx->vmcs[i],
- vtophys(vmx->guest_msrs[i]),
- guest_msr_count);
+ error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
if (error != 0)
panic("vmcs_set_msr_save error %d", error);
@@ -783,16 +913,15 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* CR0 - 0x60000010
* CR4 - 0
*/
- error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
+ error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
if (error != 0)
panic("vmx_setup_cr0_shadow %d", error);
- error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
+ error = vmx_setup_cr4_shadow(vmcs, 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
vmx->ctx[i].pmap = pmap;
- vmx->ctx[i].eptp = vmx->eptp;
}
return (vmx);
@@ -840,20 +969,20 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
#endif
}
+static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
+
static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
{
- int lastcpu;
struct vmxstate *vmxstate;
- struct invvpid_desc invvpid_desc = { 0 };
+ struct invvpid_desc invvpid_desc;
vmxstate = &vmx->state[vcpu];
- lastcpu = vmxstate->lastcpu;
- vmxstate->lastcpu = curcpu;
-
- if (lastcpu == curcpu)
+ if (vmxstate->lastcpu == curcpu)
return;
+ vmxstate->lastcpu = curcpu;
+
vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
@@ -876,8 +1005,20 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
* for "all" EP4TAs.
*/
if (vmxstate->vpid != 0) {
- invvpid_desc.vpid = vmxstate->vpid;
- invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+ invvpid_desc._res1 = 0;
+ invvpid_desc._res2 = 0;
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ } else {
+ /*
+ * The invvpid can be skipped if an invept is going to
+ * be performed before entering the guest. The invept
+ * will invalidate combined mappings tagged with
+ * 'vmx->eptp' for all vpids.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
+ }
}
}
@@ -935,7 +1076,7 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
* Inject the virtual NMI. The vector must be the NMI IDT entry
* or the VMCS entry check will fail.
*/
- info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
+ info = VMCS_INTR_INFO_NMI | VMCS_INTR_INFO_VALID;
info |= IDT_NMI;
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
@@ -957,7 +1098,7 @@ nmiblocked:
}
static void
-vmx_inject_interrupts(struct vmx *vmx, int vcpu)
+vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
int vector;
uint64_t info, rflags, interruptibility;
@@ -973,7 +1114,7 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu)
* because of a pending AST.
*/
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
- if (info & VMCS_INTERRUPTION_INFO_VALID)
+ if (info & VMCS_INTR_INFO_VALID)
return;
/*
@@ -982,9 +1123,13 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu)
if (vmx_inject_nmi(vmx, vcpu))
return;
+ if (virtual_interrupt_delivery) {
+ vmx_inject_pir(vlapic);
+ return;
+ }
+
/* Ask the local apic for a vector to inject */
- vector = lapic_pending_intr(vmx->vm, vcpu);
- if (vector < 0)
+ if (!vlapic_pending_intr(vlapic, &vector))
return;
if (vector < 32 || vector > 255)
@@ -1000,12 +1145,12 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu)
goto cantinject;
/* Inject the interrupt */
- info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
+ info = VMCS_INTR_INFO_HW_INTR | VMCS_INTR_INFO_VALID;
info |= vector;
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
/* Update the Local APIC ISR */
- lapic_intr_accepted(vmx->vm, vcpu, vector);
+ vlapic_intr_accepted(vlapic, vector);
VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
@@ -1175,11 +1320,141 @@ ept_emulation_fault(uint64_t ept_qual)
}
static int
+vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
+{
+ int error, handled, offset;
+ bool retu;
+
+ if (!virtual_interrupt_delivery)
+ return (UNHANDLED);
+
+ handled = 1;
+ offset = APIC_WRITE_OFFSET(qual);
+ switch (offset) {
+ case APIC_OFFSET_ID:
+ vlapic_id_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ vlapic_ldr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_DFR:
+ vlapic_dfr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_SVR:
+ vlapic_svr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_ESR:
+ vlapic_esr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ retu = false;
+ error = vlapic_icrlo_write_handler(vlapic, &retu);
+ if (error != 0 || retu)
+ handled = 0;
+ break;
+ case APIC_OFFSET_CMCI_LVT:
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ vlapic_lvt_write_handler(vlapic, offset);
+ break;
+ case APIC_OFFSET_TIMER_ICR:
+ vlapic_icrtmr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_TIMER_DCR:
+ vlapic_dcr_write_handler(vlapic);
+ break;
+ default:
+ handled = 0;
+ break;
+ }
+ return (handled);
+}
+
+static bool
+apic_access_fault(uint64_t gpa)
+{
+
+ if (virtual_interrupt_delivery &&
+ (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
+ return (true);
+ else
+ return (false);
+}
+
+static int
+vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
+{
+ uint64_t qual;
+ int access_type, offset, allowed;
+
+ if (!virtual_interrupt_delivery)
+ return (UNHANDLED);
+
+ qual = vmexit->u.vmx.exit_qualification;
+ access_type = APIC_ACCESS_TYPE(qual);
+ offset = APIC_ACCESS_OFFSET(qual);
+
+ allowed = 0;
+ if (access_type == 0) {
+ /*
+ * Read data access to the following registers is expected.
+ */
+ switch (offset) {
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_CMCI_LVT:
+ case APIC_OFFSET_TIMER_CCR:
+ allowed = 1;
+ break;
+ default:
+ break;
+ }
+ } else if (access_type == 1) {
+ /*
+ * Write data access to the following registers is expected.
+ */
+ switch (offset) {
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_CMCI_LVT:
+ case APIC_OFFSET_TIMER_CCR:
+ allowed = 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (allowed) {
+ vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+ vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
+ vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
+ vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
+ }
+
+ /*
+ * Regardless of whether the APIC-access is allowed this handler
+ * always returns UNHANDLED:
+ * - if the access is allowed then it is handled by emulating the
+ * instruction that caused the VM-exit (outside the critical section)
+ * - if the access is not allowed then it will be converted to an
+ * exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
+ */
+ return (UNHANDLED);
+}
+
+static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
int error, handled;
struct vmxctx *vmxctx;
- uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
+ struct vlapic *vlapic;
+ uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
uint64_t qual, gpa;
bool retu;
@@ -1203,7 +1478,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
switch (reason) {
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
- case EXIT_REASON_APIC:
+ case EXIT_REASON_APIC_ACCESS:
case EXIT_REASON_TASK_SWITCH:
case EXIT_REASON_EXCEPTION:
idtvec_info = vmcs_idt_vectoring_info();
@@ -1290,6 +1565,11 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
* host interrupt handler in the VM's softc. We will inject
* this virtual interrupt during the subsequent VM enter.
*/
+ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
+ KASSERT((intr_info & VMCS_INTR_INFO_VALID) != 0 &&
+ VMCS_INTR_INFO_TYPE(intr_info) == 0,
+ ("VM exit interruption info invalid: %#x", intr_info));
+ vmx_trigger_hostintr(intr_info & 0xff);
/*
* This is special. We want to treat this as a 'handled'
@@ -1318,24 +1598,42 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
break;
case EXIT_REASON_EPT_FAULT:
- vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
/*
* If 'gpa' lies within the address space allocated to
* memory then this must be a nested page fault otherwise
* this must be an instruction that accesses MMIO space.
*/
gpa = vmcs_gpa();
- if (vm_mem_allocated(vmx->vm, gpa)) {
+ if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
} else if (ept_emulation_fault(qual)) {
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = vmcs_gla();
vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
}
break;
+ case EXIT_REASON_VIRTUALIZED_EOI:
+ vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
+ vmexit->u.ioapic_eoi.vector = qual & 0xFF;
+ vmexit->inst_length = 0; /* trap-like */
+ break;
+ case EXIT_REASON_APIC_ACCESS:
+ handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
+ break;
+ case EXIT_REASON_APIC_WRITE:
+ /*
+ * APIC-write VM exit is trap-like so the %rip is already
+ * pointing to the next instruction.
+ */
+ vmexit->inst_length = 0;
+ vlapic = vm_lapic(vmx->vm, vcpu);
+ handled = vmx_handle_apic_write(vlapic, qual);
+ break;
default:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
break;
@@ -1387,6 +1685,18 @@ vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
}
static __inline int
+vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+
+ vmexit->rip = vmcs_guest_rip();
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
+
+ return (UNHANDLED);
+}
+
+static __inline int
vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
{
@@ -1415,26 +1725,29 @@ vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
}
static int
-vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
+vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
+ void *rendezvous_cookie)
{
int rc, handled, launched;
struct vmx *vmx;
+ struct vm *vm;
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
+ struct vlapic *vlapic;
uint64_t rip;
uint32_t exit_reason;
vmx = arg;
+ vm = vmx->vm;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
- vmexit = vm_exitinfo(vmx->vm, vcpu);
+ vlapic = vm_lapic(vm, vcpu);
+ vmexit = vm_exitinfo(vm, vcpu);
launched = 0;
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
- KASSERT(vmxctx->eptp == vmx->eptp,
- ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp));
VMPTRLD(vmcs);
@@ -1444,12 +1757,12 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
* from a different process than the one that actually runs it.
*
* If the life of a virtual machine was spent entirely in the context
- * of a single process we could do this once in vmcs_set_defaults().
+ * of a single process we could do this once in vmx_vminit().
*/
vmcs_write(VMCS_HOST_CR3, rcr3());
vmcs_write(VMCS_GUEST_RIP, startrip);
- vmx_set_pcpu_defaults(vmx, vcpu);
+ vmx_set_pcpu_defaults(vmx, vcpu, pmap);
do {
/*
* Interrupts are disabled from this point on until the
@@ -1476,9 +1789,15 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
break;
}
- vmx_inject_interrupts(vmx, vcpu);
+ if (vcpu_rendezvous_pending(rendezvous_cookie)) {
+ enable_intr();
+ handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
+ break;
+ }
+
+ vmx_inject_interrupts(vmx, vcpu, vlapic);
vmx_run_trace(vmx, vcpu);
- rc = vmx_enter_guest(vmxctx, launched);
+ rc = vmx_enter_guest(vmxctx, vmx, launched);
enable_intr();
@@ -1509,9 +1828,9 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
}
if (!handled)
- vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
+ vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
- VCPU_CTR1(vmx->vm, vcpu, "returning from vmx_run: exitcode %d",
+ VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
vmexit->exitcode);
VMCLEAR(vmcs);
@@ -1524,6 +1843,9 @@ vmx_vmcleanup(void *arg)
int i, error;
struct vmx *vmx = arg;
+ if (virtual_interrupt_delivery)
+ vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
+
for (i = 0; i < VM_MAXCPU; i++)
vpid_free(vmx->state[i].vpid);
@@ -1731,11 +2053,11 @@ vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
if (error)
return (error);
- if (info & VMCS_INTERRUPTION_INFO_VALID)
+ if (info & VMCS_INTR_INFO_VALID)
return (EAGAIN);
info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
- info |= VMCS_INTERRUPTION_INFO_VALID;
+ info |= VMCS_INTR_INFO_VALID;
error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
if (error != 0)
return (error);
@@ -1887,6 +2209,258 @@ vmx_setcap(void *arg, int vcpu, int type, int val)
return (retval);
}
+struct vlapic_vtx {
+ struct vlapic vlapic;
+ struct pir_desc *pir_desc;
+ struct vmx *vmx;
+};
+
+#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \
+do { \
+ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \
+ level ? "level" : "edge", vector); \
+ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \
+ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \
+ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \
+ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \
+ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
+} while (0)
+
+/*
+ * vlapic->ops handlers that utilize the APICv hardware assist described in
+ * Chapter 29 of the Intel SDM.
+ */
+static int
+vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct pir_desc *pir_desc;
+ uint64_t mask;
+ int idx, notify;
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ pir_desc = vlapic_vtx->pir_desc;
+
+ /*
+ * Keep track of interrupt requests in the PIR descriptor. This is
+ * because the virtual APIC page pointed to by the VMCS cannot be
+ * modified if the vcpu is running.
+ */
+ idx = vector / 64;
+ mask = 1UL << (vector % 64);
+ atomic_set_long(&pir_desc->pir[idx], mask);
+ notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
+
+ VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
+ level, "vmx_set_intr_ready");
+ return (notify);
+}
+
+static int
+vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct pir_desc *pir_desc;
+ struct LAPIC *lapic;
+ uint64_t pending, pirval;
+ uint32_t ppr, vpr;
+ int i;
+
+ /*
+ * This function is only expected to be called from the 'HLT' exit
+ * handler which does not care about the vector that is pending.
+ */
+ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ pir_desc = vlapic_vtx->pir_desc;
+
+ pending = atomic_load_acq_long(&pir_desc->pending);
+ if (!pending)
+ return (0); /* common case */
+
+ /*
+ * If there is an interrupt pending then it will be recognized only
+ * if its priority is greater than the processor priority.
+ *
+ * Special case: if the processor priority is zero then any pending
+ * interrupt will be recognized.
+ */
+ lapic = vlapic->apic_page;
+ ppr = lapic->ppr & 0xf0;
+ if (ppr == 0)
+ return (1);
+
+ VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
+ lapic->ppr);
+
+ for (i = 3; i >= 0; i--) {
+ pirval = pir_desc->pir[i];
+ if (pirval != 0) {
+ vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
+ return (vpr > ppr);
+ }
+ }
+ return (0);
+}
+
+static void
+vmx_intr_accepted(struct vlapic *vlapic, int vector)
+{
+
+ panic("vmx_intr_accepted: not expected to be called");
+}
+
+static void
+vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct vmx *vmx;
+ struct vmcs *vmcs;
+ uint64_t mask, val;
+
+ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
+ KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
+ ("vmx_set_tmr: vcpu cannot be running"));
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ vmx = vlapic_vtx->vmx;
+ vmcs = &vmx->vmcs[vlapic->vcpuid];
+ mask = 1UL << (vector % 64);
+
+ VMPTRLD(vmcs);
+ val = vmcs_read(VMCS_EOI_EXIT(vector));
+ if (level)
+ val |= mask;
+ else
+ val &= ~mask;
+ vmcs_write(VMCS_EOI_EXIT(vector), val);
+ VMCLEAR(vmcs);
+}
+
+static void
+vmx_post_intr(struct vlapic *vlapic, int hostcpu)
+{
+
+ ipi_cpu(hostcpu, pirvec);
+}
+
+/*
+ * Transfer the pending interrupts in the PIR descriptor to the IRR
+ * in the virtual APIC page.
+ */
+static void
+vmx_inject_pir(struct vlapic *vlapic)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct pir_desc *pir_desc;
+ struct LAPIC *lapic;
+ uint64_t val, pirval;
+ int rvi, pirbase;
+ uint16_t intr_status_old, intr_status_new;
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ pir_desc = vlapic_vtx->pir_desc;
+ if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
+ VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
+ "no posted interrupt pending");
+ return;
+ }
+
+ pirval = 0;
+ lapic = vlapic->apic_page;
+
+ val = atomic_readandclear_long(&pir_desc->pir[0]);
+ if (val != 0) {
+ lapic->irr0 |= val;
+ lapic->irr1 |= val >> 32;
+ pirbase = 0;
+ pirval = val;
+ }
+
+ val = atomic_readandclear_long(&pir_desc->pir[1]);
+ if (val != 0) {
+ lapic->irr2 |= val;
+ lapic->irr3 |= val >> 32;
+ pirbase = 64;
+ pirval = val;
+ }
+
+ val = atomic_readandclear_long(&pir_desc->pir[2]);
+ if (val != 0) {
+ lapic->irr4 |= val;
+ lapic->irr5 |= val >> 32;
+ pirbase = 128;
+ pirval = val;
+ }
+
+ val = atomic_readandclear_long(&pir_desc->pir[3]);
+ if (val != 0) {
+ lapic->irr6 |= val;
+ lapic->irr7 |= val >> 32;
+ pirbase = 192;
+ pirval = val;
+ }
+ VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
+
+ /*
+ * Update RVI so the processor can evaluate pending virtual
+ * interrupts on VM-entry.
+ */
+ if (pirval != 0) {
+ rvi = pirbase + flsl(pirval) - 1;
+ intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
+ intr_status_new = (intr_status_old & 0xFF00) | rvi;
+ if (intr_status_new > intr_status_old) {
+ vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
+ VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
+ "guest_intr_status changed from 0x%04x to 0x%04x",
+ intr_status_old, intr_status_new);
+ }
+ }
+}
+
+static struct vlapic *
+vmx_vlapic_init(void *arg, int vcpuid)
+{
+ struct vmx *vmx;
+ struct vlapic *vlapic;
+ struct vlapic_vtx *vlapic_vtx;
+
+ vmx = arg;
+
+ vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vmx->vm;
+ vlapic->vcpuid = vcpuid;
+ vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
+ vlapic_vtx->vmx = vmx;
+
+ if (virtual_interrupt_delivery) {
+ vlapic->ops.set_intr_ready = vmx_set_intr_ready;
+ vlapic->ops.pending_intr = vmx_pending_intr;
+ vlapic->ops.intr_accepted = vmx_intr_accepted;
+ vlapic->ops.set_tmr = vmx_set_tmr;
+ }
+
+ if (posted_interrupts)
+ vlapic->ops.post_intr = vmx_post_intr;
+
+ vlapic_init(vlapic);
+
+ return (vlapic);
+}
+
+static void
+vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
+{
+
+ vlapic_cleanup(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
struct vmm_ops vmm_ops_intel = {
vmx_init,
vmx_cleanup,
@@ -1903,4 +2477,6 @@ struct vmm_ops vmm_ops_intel = {
vmx_setcap,
ept_vmspace_alloc,
ept_vmspace_free,
+ vmx_vlapic_init,
+ vmx_vlapic_cleanup,
};
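[Annotation, not part of the patch] Taken together, the APICv pieces above
compose into one flow. A hypothetical wrapper to show it (the helpers are the
ones defined in this file; the wrapper and its 'hostcpu' convention are
invented for illustration — in the real code the notify decision is made by
the generic vlapic layer via vcpu_notify_event()):

    static void
    posted_intr_flow(struct vlapic *vlapic, int vector, int hostcpu)
    {
    	/* 1. Record the vector in the PIR; 'pending' gates notification. */
    	if (vmx_set_intr_ready(vlapic, vector, false)) {
    		if (hostcpu >= 0) {
    			/*
    			 * 2a. vcpu is running in guest mode: send 'pirvec'
    			 * so the CPU processes the PIR without a VM exit.
    			 */
    			vmx_post_intr(vlapic, hostcpu);
    		}
    		/*
    		 * 2b. Otherwise vmx_inject_pir() transfers the PIR into
    		 * the virtual APIC IRR just before the next VM entry.
    		 */
    	}
    }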
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index 67ef631..80bfd72 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -64,16 +64,13 @@ struct vmxctx {
/*
* XXX todo debug registers and fpu state
*/
-
- int inst_fail_status;
- long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
+ int inst_fail_status;
/*
- * The 'eptp' and the 'pmap' do not change during the lifetime of
- * the VM so it is safe to keep a copy in each vcpu's vmxctx.
+ * The pmap needs to be deactivated in vmx_exit_guest()
+ * so keep a copy of the 'pmap' in each vmxctx.
*/
- vm_paddr_t eptp;
struct pmap *pmap;
};
@@ -88,27 +85,45 @@ struct vmxstate {
uint16_t vpid;
};
+struct apic_page {
+ uint32_t reg[PAGE_SIZE / 4];
+};
+CTASSERT(sizeof(struct apic_page) == PAGE_SIZE);
+
+/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */
+struct pir_desc {
+ uint64_t pir[4];
+ uint64_t pending;
+ uint64_t unused[3];
+} __aligned(64);
+CTASSERT(sizeof(struct pir_desc) == 64);
+
/* virtual machine softc */
struct vmx {
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */
char msr_bitmap[PAGE_SIZE];
+ struct pir_desc pir_desc[VM_MAXCPU];
struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
uint64_t eptp;
struct vm *vm;
+ long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
};
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, guest_msrs) & 15) == 0);
+CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
#define VMX_GUEST_VMEXIT 0
#define VMX_VMRESUME_ERROR 1
#define VMX_VMLAUNCH_ERROR 2
#define VMX_INVEPT_ERROR 3
-int vmx_enter_guest(struct vmxctx *ctx, int launched);
+int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched);
void vmx_exit_guest(void);
+void vmx_call_isr(uintptr_t entry);
u_long vmx_fix_cr0(u_long cr0);
u_long vmx_fix_cr4(u_long cr4);
diff --git a/sys/amd64/vmm/intel/vmx_controls.h b/sys/amd64/vmm/intel/vmx_controls.h
index 3cd2eff..2b117ae 100644
--- a/sys/amd64/vmm/intel/vmx_controls.h
+++ b/sys/amd64/vmm/intel/vmx_controls.h
@@ -34,6 +34,7 @@
#define PINBASED_NMI_EXITING (1 << 3)
#define PINBASED_VIRTUAL_NMI (1 << 5)
#define PINBASED_PREMPTION_TIMER (1 << 6)
+#define PINBASED_POSTED_INTERRUPT (1 << 7)
/* Primary Processor-Based VM-Execution Controls */
#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
@@ -59,16 +60,18 @@
#define PROCBASED_SECONDARY_CONTROLS (1U << 31)
/* Secondary Processor-Based VM-Execution Controls */
-#define PROCBASED2_VIRTUALIZE_APIC (1 << 0)
-#define PROCBASED2_ENABLE_EPT (1 << 1)
-#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
-#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
-#define PROCBASED2_VIRTUALIZE_X2APIC (1 << 4)
-#define PROCBASED2_ENABLE_VPID (1 << 5)
-#define PROCBASED2_WBINVD_EXITING (1 << 6)
-#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
-#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
-#define PROCBASED2_ENABLE_INVPCID (1 << 12)
+#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8)
+#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+#define PROCBASED2_ENABLE_INVPCID (1 << 12)
/* VM Exit Controls */
#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
index bf463dc..5c91fec 100644
--- a/sys/amd64/vmm/intel/vmx_genassym.c
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -68,10 +68,10 @@ ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
ASSYM(VMXCTX_INST_FAIL_STATUS, offsetof(struct vmxctx, inst_fail_status));
-ASSYM(VMXCTX_EPTGEN, offsetof(struct vmxctx, eptgen));
-
ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
-ASSYM(VMXCTX_EPTP, offsetof(struct vmxctx, eptp));
+
+ASSYM(VMX_EPTGEN, offsetof(struct vmx, eptgen));
+ASSYM(VMX_EPTP, offsetof(struct vmx, eptp));
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
@@ -84,3 +84,6 @@ ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));
+
+ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL));
+ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL));
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index d616984..9e8cf2d 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -97,7 +97,8 @@
/*
* vmx_enter_guest(struct vmxctx *vmxctx, int launched)
* %rdi: pointer to the 'vmxctx'
- * %esi: launch state of the VMCS
+ * %rsi: pointer to the 'vmx'
+ * %edx: launch state of the VMCS
* Interrupts must be disabled on entry.
*/
ENTRY(vmx_enter_guest)
@@ -114,19 +115,19 @@ ENTRY(vmx_enter_guest)
LK btsl %eax, PM_ACTIVE(%r11)
/*
- * If 'vmxctx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
+ * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
* then we must invalidate all mappings associated with this EPTP.
*/
movq PM_EPTGEN(%r11), %r10
- cmpq %r10, VMXCTX_EPTGEN(%rdi, %rax, 8)
+ cmpq %r10, VMX_EPTGEN(%rsi, %rax, 8)
je guest_restore
- /* Refresh 'vmxctx->eptgen[curcpu]' */
- movq %r10, VMXCTX_EPTGEN(%rdi, %rax, 8)
+ /* Refresh 'vmx->eptgen[curcpu]' */
+ movq %r10, VMX_EPTGEN(%rsi, %rax, 8)
/* Setup the invept descriptor on the host stack */
mov %rsp, %r11
- movq VMXCTX_EPTP(%rdi), %rax
+ movq VMX_EPTP(%rsi), %rax
movq %rax, -16(%r11)
movq $0x0, -8(%r11)
mov $0x1, %eax /* Single context invalidate */
@@ -134,7 +135,7 @@ ENTRY(vmx_enter_guest)
jbe invept_error /* Check invept instruction error */
guest_restore:
- cmpl $0, %esi
+ cmpl $0, %edx
je do_launch
VMX_GUEST_RESTORE
@@ -234,3 +235,21 @@ ENTRY(vmx_exit_guest)
movl $VMX_GUEST_VMEXIT, %eax
ret
END(vmx_exit_guest)
+
+/*
+ * %rdi = interrupt handler entry point
+ *
+ * Calling sequence described in the "Instruction Set Reference" for the "INT"
+ * instruction in Intel SDM, Vol 2.
+ */
+ENTRY(vmx_call_isr)
+ mov %rsp, %r11 /* save %rsp */
+ and $~0xf, %rsp /* align on 16-byte boundary */
+ pushq $KERNEL_SS /* %ss */
+ pushq %r11 /* %rsp */
+ pushfq /* %rflags */
+ pushq $KERNEL_CS /* %cs */
+ cli /* disable interrupts */
+ callq *%rdi /* push %rip and call isr */
+ ret
+END(vmx_call_isr)
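[Annotation, not part of the patch] vmx_call_isr() hand-builds the stack frame
the CPU would push when delivering an interrupt through a 64-bit IDT gate, so
the handler's iretq unwinds straight back to the caller. Sketched as a C
layout (illustrative; lowest address, where %rsp points after the callq,
comes first):

    #include <stdint.h>

    struct intr_frame_sketch {
    	uint64_t rip;		/* pushed implicitly by callq *%rdi */
    	uint64_t cs;		/* pushq $KERNEL_CS */
    	uint64_t rflags;	/* pushfq */
    	uint64_t rsp;		/* pushq %r11 (the saved stack pointer) */
    	uint64_t ss;		/* pushq $KERNEL_SS */
    };

vmx_trigger_hostintr() earlier in this patch extracts the handler entry point
from the IDT gate descriptor and passes it in %rdi.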
diff --git a/sys/amd64/vmm/io/vioapic.c b/sys/amd64/vmm/io/vioapic.c
index 151065a..703e479 100644
--- a/sys/amd64/vmm/io/vioapic.c
+++ b/sys/amd64/vmm/io/vioapic.c
@@ -222,8 +222,52 @@ vioapic_pulse_irq(struct vm *vm, int irq)
return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE));
}
+/*
+ * Reset the vlapic's trigger-mode register to reflect the ioapic pin
+ * configuration.
+ */
+static void
+vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg)
+{
+ struct vioapic *vioapic;
+ struct vlapic *vlapic;
+ uint32_t low, high, dest;
+ int delmode, pin, vector;
+ bool level, phys;
+
+ vlapic = vm_lapic(vm, vcpuid);
+ vioapic = vm_ioapic(vm);
+
+ VIOAPIC_LOCK(vioapic);
+ /*
+ * Reset all vectors to be edge-triggered.
+ */
+ vlapic_reset_tmr(vlapic);
+ for (pin = 0; pin < REDIR_ENTRIES; pin++) {
+ low = vioapic->rtbl[pin].reg;
+ high = vioapic->rtbl[pin].reg >> 32;
+
+ level = low & IOART_TRGRLVL ? true : false;
+ if (!level)
+ continue;
+
+ /*
+ * For a level-triggered 'pin' let the vlapic figure out if
+ * an assertion on this 'pin' would result in an interrupt
+ * being delivered to it. If yes, then it will modify the
+ * TMR bit associated with this vector to level-triggered.
+ */
+ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
+ delmode = low & IOART_DELMOD;
+ vector = low & IOART_INTVEC;
+ dest = high >> APIC_ID_SHIFT;
+ vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector);
+ }
+ VIOAPIC_UNLOCK(vioapic);
+}
+
static uint32_t
-vioapic_read(struct vioapic *vioapic, uint32_t addr)
+vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr)
{
int regnum, pin, rshift;
@@ -258,10 +302,12 @@ vioapic_read(struct vioapic *vioapic, uint32_t addr)
}
static void
-vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
+vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data)
{
uint64_t data64, mask64;
+ uint64_t last, changed;
int regnum, pin, lshift;
+ cpuset_t allvcpus;
regnum = addr & 0xff;
switch (regnum) {
@@ -285,6 +331,8 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
else
lshift = 0;
+ last = vioapic->rtbl[pin].reg;
+
data64 = (uint64_t)data << lshift;
mask64 = (uint64_t)0xffffffff << lshift;
vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS;
@@ -294,6 +342,22 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
pin, vioapic->rtbl[pin].reg);
/*
+ * If any fields in the redirection table entry (except mask
+ * or polarity) have changed then rendezvous all the vcpus
+ * to update their vlapic trigger-mode registers.
+ */
+ changed = last ^ vioapic->rtbl[pin].reg;
+ if (changed & ~(IOART_INTMASK | IOART_INTPOL)) {
+ VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate "
+ "vlapic trigger-mode register", pin);
+ VIOAPIC_UNLOCK(vioapic);
+ allvcpus = vm_active_cpus(vioapic->vm);
+ vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus,
+ vioapic_update_tmr, NULL);
+ VIOAPIC_LOCK(vioapic);
+ }
+
+ /*
* Generate an interrupt if the following conditions are met:
* - pin is not masked
* - previous interrupt has been EOIed
@@ -310,8 +374,8 @@ vioapic_write(struct vioapic *vioapic, uint32_t addr, uint32_t data)
}
static int
-vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa, uint64_t *data,
- int size, bool doread)
+vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa,
+ uint64_t *data, int size, bool doread)
{
uint64_t offset;
@@ -334,10 +398,13 @@ vioapic_mmio_rw(struct vioapic *vioapic, uint64_t gpa, uint64_t *data,
else
vioapic->ioregsel = *data;
} else {
- if (doread)
- *data = vioapic_read(vioapic, vioapic->ioregsel);
- else
- vioapic_write(vioapic, vioapic->ioregsel, *data);
+ if (doread) {
+ *data = vioapic_read(vioapic, vcpuid,
+ vioapic->ioregsel);
+ } else {
+ vioapic_write(vioapic, vcpuid, vioapic->ioregsel,
+ *data);
+ }
}
VIOAPIC_UNLOCK(vioapic);
@@ -352,7 +419,7 @@ vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
- error = vioapic_mmio_rw(vioapic, gpa, rval, size, true);
+ error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true);
return (error);
}
@@ -364,7 +431,7 @@ vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
- error = vioapic_mmio_rw(vioapic, gpa, &wval, size, false);
+ error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false);
return (error);
}
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 695040d..2395247 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -37,108 +37,34 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/smp.h>
-#include <machine/clock.h>
#include <x86/specialreg.h>
#include <x86/apicreg.h>
+#include <machine/clock.h>
+#include <machine/smp.h>
+
#include <machine/vmm.h>
-#include "vmm_stat.h"
+#include "vmm_ipi.h"
#include "vmm_lapic.h"
#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
#include "vlapic.h"
+#include "vlapic_priv.h"
#include "vioapic.h"
-#define VLAPIC_CTR0(vlapic, format) \
- VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
-
-#define VLAPIC_CTR1(vlapic, format, p1) \
- VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
-
-#define VLAPIC_CTR2(vlapic, format, p1, p2) \
- VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2)
-
-#define VLAPIC_CTR_IRR(vlapic, msg) \
-do { \
- uint32_t *irrptr = &(vlapic)->apic.irr0; \
- irrptr[0] = irrptr[0]; /* silence compiler */ \
- VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
- VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
-} while (0)
-
-#define VLAPIC_CTR_ISR(vlapic, msg) \
-do { \
- uint32_t *isrptr = &(vlapic)->apic.isr0; \
- isrptr[0] = isrptr[0]; /* silence compiler */ \
- VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
- VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
-} while (0)
-
-static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
-
#define PRIO(x) ((x) >> 4)
#define VLAPIC_VERSION (16)
-#define VLAPIC_MAXLVT_ENTRIES (APIC_LVT_CMCI)
#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
-enum boot_state {
- BS_INIT,
- BS_SIPI,
- BS_RUNNING
-};
-
-struct vlapic {
- struct vm *vm;
- int vcpuid;
-
- struct LAPIC apic;
-
- uint32_t esr_pending;
- int esr_firing;
-
- struct callout callout; /* vlapic timer */
- struct bintime timer_fire_bt; /* callout expiry time */
- struct bintime timer_freq_bt; /* timer frequency */
- struct bintime timer_period_bt; /* timer period */
- struct mtx timer_mtx;
-
- /*
- * The 'isrvec_stk' is a stack of vectors injected by the local apic.
- * A vector is popped from the stack when the processor does an EOI.
- * The vector on the top of the stack is used to compute the
- * Processor Priority in conjunction with the TPR.
- */
- uint8_t isrvec_stk[ISRVEC_STK_SIZE];
- int isrvec_stk_top;
-
- uint64_t msr_apicbase;
- enum boot_state boot_state;
-};
-
/*
* The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
- * vlapic_callout_handler() and vcpu accesses to the following registers:
- * - initial count register aka icr_timer
- * - current count register aka ccr_timer
- * - divide config register aka dcr_timer
+ * vlapic_callout_handler() and vcpu accesses to:
+ * - timer_freq_bt, timer_period_bt, timer_fire_bt
* - timer LVT register
- *
- * Note that the vlapic_callout_handler() does not write to any of these
- * registers so they can be safely read from the vcpu context without locking.
*/
#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx))
#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx))
@@ -156,72 +82,71 @@ vlapic_get_id(struct vlapic *vlapic)
return (vlapic->vcpuid << 24);
}
-static __inline uint32_t
-vlapic_get_ldr(struct vlapic *vlapic)
+static uint32_t
+x2apic_ldr(struct vlapic *vlapic)
{
- struct LAPIC *lapic;
int apicid;
uint32_t ldr;
- lapic = &vlapic->apic;
- if (x2apic(vlapic)) {
- apicid = vlapic_get_id(vlapic);
- ldr = 1 << (apicid & 0xf);
- ldr |= (apicid & 0xffff0) << 12;
- return (ldr);
- } else
- return (lapic->ldr);
+ apicid = vlapic_get_id(vlapic);
+ ldr = 1 << (apicid & 0xf);
+ ldr |= (apicid & 0xffff0) << 12;
+ return (ldr);
}
-static __inline uint32_t
-vlapic_get_dfr(struct vlapic *vlapic)
+void
+vlapic_dfr_write_handler(struct vlapic *vlapic)
{
struct LAPIC *lapic;
- lapic = &vlapic->apic;
- if (x2apic(vlapic))
- return (0);
- else
- return (lapic->dfr);
-}
-
-static void
-vlapic_set_dfr(struct vlapic *vlapic, uint32_t data)
-{
- uint32_t dfr;
- struct LAPIC *lapic;
-
+ lapic = vlapic->apic_page;
if (x2apic(vlapic)) {
- VM_CTR1(vlapic->vm, "write to DFR in x2apic mode: %#x", data);
+ VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
+ lapic->dfr);
+ lapic->dfr = 0;
return;
}
- lapic = &vlapic->apic;
- dfr = (lapic->dfr & APIC_DFR_RESERVED) | (data & APIC_DFR_MODEL_MASK);
- if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
+ lapic->dfr &= APIC_DFR_MODEL_MASK;
+ lapic->dfr |= APIC_DFR_RESERVED;
+
+ if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
- else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
+ else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
else
- VLAPIC_CTR1(vlapic, "vlapic DFR in Unknown Model %#x", dfr);
-
- lapic->dfr = dfr;
+ VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr);
}
-static void
-vlapic_set_ldr(struct vlapic *vlapic, uint32_t data)
+void
+vlapic_ldr_write_handler(struct vlapic *vlapic)
{
struct LAPIC *lapic;
+ lapic = vlapic->apic_page;
+
/* LDR is read-only in x2apic mode */
if (x2apic(vlapic)) {
- VLAPIC_CTR1(vlapic, "write to LDR in x2apic mode: %#x", data);
- return;
+ VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
+ lapic->ldr);
+ lapic->ldr = x2apic_ldr(vlapic);
+ } else {
+ lapic->ldr &= ~APIC_LDR_RESERVED;
+ VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
}
+}
- lapic = &vlapic->apic;
- lapic->ldr = data & ~APIC_LDR_RESERVED;
- VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
+void
+vlapic_id_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ /*
+ * We don't allow the ID register to be modified so reset it back to
+ * its default value.
+ */
+ lapic = vlapic->apic_page;
+ lapic->id = vlapic_get_id(vlapic);
}
static int
@@ -249,16 +174,6 @@ vlapic_timer_divisor(uint32_t dcr)
}
}
-static void
-vlapic_mask_lvts(uint32_t *lvts, int num_lvt)
-{
- int i;
- for (i = 0; i < num_lvt; i++) {
- *lvts |= APIC_LVT_M;
- lvts += 4;
- }
-}
-
#if 0
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
@@ -277,7 +192,7 @@ vlapic_get_ccr(struct vlapic *vlapic)
uint32_t ccr;
ccr = 0;
- lapic = &vlapic->apic;
+ lapic = vlapic->apic_page;
VLAPIC_TIMER_LOCK(vlapic);
if (callout_active(&vlapic->callout)) {
@@ -301,18 +216,18 @@ vlapic_get_ccr(struct vlapic *vlapic)
return (ccr);
}
-static void
-vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr)
+void
+vlapic_dcr_write_handler(struct vlapic *vlapic)
{
struct LAPIC *lapic;
int divisor;
- lapic = &vlapic->apic;
+ lapic = vlapic->apic_page;
VLAPIC_TIMER_LOCK(vlapic);
- lapic->dcr_timer = dcr;
- divisor = vlapic_timer_divisor(dcr);
- VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", dcr, divisor);
+ divisor = vlapic_timer_divisor(lapic->dcr_timer);
+ VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
+ lapic->dcr_timer, divisor);
/*
* Update the timer frequency and the timer period.
@@ -327,57 +242,42 @@ vlapic_set_dcr(struct vlapic *vlapic, uint32_t dcr)
VLAPIC_TIMER_UNLOCK(vlapic);
}
-static void
-vlapic_update_errors(struct vlapic *vlapic)
-{
- struct LAPIC *lapic = &vlapic->apic;
- lapic->esr = vlapic->esr_pending;
- vlapic->esr_pending = 0;
-}
-
-static void
-vlapic_reset(struct vlapic *vlapic)
+void
+vlapic_esr_write_handler(struct vlapic *vlapic)
{
struct LAPIC *lapic;
- lapic = &vlapic->apic;
- bzero(lapic, sizeof(struct LAPIC));
-
- lapic->version = VLAPIC_VERSION;
- lapic->version |= (VLAPIC_MAXLVT_ENTRIES << MAXLVTSHIFT);
- lapic->dfr = 0xffffffff;
- lapic->svr = APIC_SVR_VECTOR;
- vlapic_mask_lvts(&lapic->lvt_timer, 6);
- vlapic_mask_lvts(&lapic->lvt_cmci, 1);
- vlapic_set_dcr(vlapic, 0);
-
- if (vlapic->vcpuid == 0)
- vlapic->boot_state = BS_RUNNING; /* BSP */
- else
- vlapic->boot_state = BS_INIT; /* AP */
+ lapic = vlapic->apic_page;
+ lapic->esr = vlapic->esr_pending;
+ vlapic->esr_pending = 0;
}
-void
+int
vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
- struct LAPIC *lapic = &vlapic->apic;
- uint32_t *irrptr, *tmrptr, mask;
- int idx;
+ struct LAPIC *lapic;
+ uint32_t *irrptr, *tmrptr, mask;
+ int idx;
- if (vector < 0 || vector >= 256)
- panic("vlapic_set_intr_ready: invalid vector %d\n", vector);
+ KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));
+ lapic = vlapic->apic_page;
if (!(lapic->svr & APIC_SVR_ENABLE)) {
VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
"interrupt %d", vector);
- return;
+ return (0);
}
if (vector < 16) {
vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR);
- return;
+ VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d",
+ vector);
+ return (1);
}
-
+
+ if (vlapic->ops.set_intr_ready)
+ return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
+
idx = (vector / 32) * 4;
mask = 1 << (vector % 32);
@@ -385,23 +285,22 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
atomic_set_int(&irrptr[idx], mask);
/*
- * Upon acceptance of an interrupt into the IRR the corresponding
- * TMR bit is cleared for edge-triggered interrupts and set for
- * level-triggered interrupts.
+	 * Verify that the trigger-mode of the interrupt matches the
+	 * vlapic TMR registers.
*/
tmrptr = &lapic->tmr0;
- if (level)
- atomic_set_int(&tmrptr[idx], mask);
- else
- atomic_clear_int(&tmrptr[idx], mask);
+ KASSERT((tmrptr[idx] & mask) == (level ? mask : 0),
+ ("vlapic TMR[%d] is 0x%08x but interrupt is %s-triggered",
+ idx / 4, tmrptr[idx], level ? "level" : "edge"));
VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+ return (1);
}
static __inline uint32_t *
vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
int i;
switch (offset) {
@@ -415,24 +314,65 @@ vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
}
}
+static __inline int
+lvt_off_to_idx(uint32_t offset)
+{
+ int index;
+
+ switch (offset) {
+ case APIC_OFFSET_CMCI_LVT:
+ index = APIC_LVT_CMCI;
+ break;
+ case APIC_OFFSET_TIMER_LVT:
+ index = APIC_LVT_TIMER;
+ break;
+ case APIC_OFFSET_THERM_LVT:
+ index = APIC_LVT_THERMAL;
+ break;
+ case APIC_OFFSET_PERF_LVT:
+ index = APIC_LVT_PMC;
+ break;
+ case APIC_OFFSET_LINT0_LVT:
+ index = APIC_LVT_LINT0;
+ break;
+ case APIC_OFFSET_LINT1_LVT:
+ index = APIC_LVT_LINT1;
+ break;
+ case APIC_OFFSET_ERROR_LVT:
+ index = APIC_LVT_ERROR;
+ break;
+ default:
+ index = -1;
+ break;
+ }
+ KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
+ "invalid lvt index %d for offset %#x", index, offset));
+
+ return (index);
+}
+
static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
+ int idx;
+ uint32_t val;
- return (*vlapic_get_lvtptr(vlapic, offset));
+ idx = lvt_off_to_idx(offset);
+ val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
+ return (val);
}
-static void
-vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val)
+void
+vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
{
- uint32_t *lvtptr, mask;
+ uint32_t *lvtptr, mask, val;
struct LAPIC *lapic;
+ int idx;
- lapic = &vlapic->apic;
+ lapic = vlapic->apic_page;
lvtptr = vlapic_get_lvtptr(vlapic, offset);
-
- if (offset == APIC_OFFSET_TIMER_LVT)
- VLAPIC_TIMER_LOCK(vlapic);
+ val = *lvtptr;
+ idx = lvt_off_to_idx(offset);
if (!(lapic->svr & APIC_SVR_ENABLE))
val |= APIC_LVT_M;
@@ -451,10 +391,36 @@ vlapic_set_lvt(struct vlapic *vlapic, uint32_t offset, uint32_t val)
mask |= APIC_LVT_DM;
break;
}
- *lvtptr = val & mask;
+ val &= mask;
+ *lvtptr = val;
+ atomic_store_rel_32(&vlapic->lvt_last[idx], val);
+}
+
+static void
+vlapic_mask_lvts(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ lapic->lvt_cmci |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
+
+ lapic->lvt_timer |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
- if (offset == APIC_OFFSET_TIMER_LVT)
- VLAPIC_TIMER_UNLOCK(vlapic);
+ lapic->lvt_thermal |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
+
+ lapic->lvt_pcint |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
+
+ lapic->lvt_lint0 |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
+
+ lapic->lvt_lint1 |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
+
+ lapic->lvt_error |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
}
static int
@@ -474,8 +440,8 @@ vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt)
vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
return (0);
}
- vlapic_set_intr_ready(vlapic, vec, false);
- vcpu_notify_event(vlapic->vm, vlapic->vcpuid);
+ if (vlapic_set_intr_ready(vlapic, vec, false))
+ vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true);
break;
case APIC_LVT_DM_NMI:
vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
@@ -494,7 +460,7 @@ dump_isrvec_stk(struct vlapic *vlapic)
int i;
uint32_t *isrptr;
- isrptr = &vlapic->apic.isr0;
+ isrptr = &vlapic->apic_page->isr0;
for (i = 0; i < 8; i++)
printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
@@ -519,7 +485,7 @@ vlapic_update_ppr(struct vlapic *vlapic)
* bits is set in the ISRx registers.
*/
isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
- tpr = vlapic->apic.tpr;
+ tpr = vlapic->apic_page->tpr;
#if 1
{
@@ -548,7 +514,7 @@ vlapic_update_ppr(struct vlapic *vlapic)
* corresponding entry on the isrvec stack.
*/
i = 1;
- isrptr = &vlapic->apic.isr0;
+ isrptr = &vlapic->apic_page->isr0;
for (vector = 0; vector < 256; vector++) {
idx = (vector / 32) * 4;
if (isrptr[idx] & (1 << (vector % 32))) {
@@ -568,14 +534,14 @@ vlapic_update_ppr(struct vlapic *vlapic)
else
ppr = isrvec & 0xf0;
- vlapic->apic.ppr = ppr;
+ vlapic->apic_page->ppr = ppr;
VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
}
static void
vlapic_process_eoi(struct vlapic *vlapic)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
uint32_t *isrptr, *tmrptr;
int i, idx, bitpos, vector;
@@ -675,7 +641,7 @@ vlapic_fire_cmci(struct vlapic *vlapic)
}
}
-static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_ENTRIES,
+static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
"lvts triggered");
int
@@ -735,8 +701,6 @@ vlapic_callout_handler(void *arg)
callout_deactivate(&vlapic->callout);
- KASSERT(vlapic->apic.icr_timer != 0, ("vlapic timer is disabled"));
-
vlapic_fire_timer(vlapic);
if (vlapic_periodic_timer(vlapic)) {
@@ -781,16 +745,17 @@ done:
VLAPIC_TIMER_UNLOCK(vlapic);
}
-static void
-vlapic_set_icr_timer(struct vlapic *vlapic, uint32_t icr_timer)
+void
+vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
struct LAPIC *lapic;
sbintime_t sbt;
+ uint32_t icr_timer;
VLAPIC_TIMER_LOCK(vlapic);
- lapic = &vlapic->apic;
- lapic->icr_timer = icr_timer;
+ lapic = vlapic->apic_page;
+ icr_timer = lapic->icr_timer;
vlapic->timer_period_bt = vlapic->timer_freq_bt;
bintime_mul(&vlapic->timer_period_bt, icr_timer);
@@ -872,8 +837,8 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
CPU_CLR(vcpuid, &amask);
vlapic = vm_lapic(vm, vcpuid);
- dfr = vlapic_get_dfr(vlapic);
- ldr = vlapic_get_ldr(vlapic);
+ dfr = vlapic->apic_page->dfr;
+ ldr = vlapic->apic_page->ldr;
if ((dfr & APIC_DFR_MODEL_MASK) ==
APIC_DFR_MODEL_FLAT) {
@@ -912,16 +877,22 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
-static int
-lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu)
+int
+vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
{
int i;
bool phys;
cpuset_t dmask;
+ uint64_t icrval;
uint32_t dest, vec, mode;
struct vlapic *vlapic2;
struct vm_exit *vmexit;
-
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+ lapic->icr_lo &= ~APIC_DELSTAT_PEND;
+ icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
+
if (x2apic(vlapic))
dest = icrval >> 32;
else
@@ -931,9 +902,12 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu)
if (mode == APIC_DELMODE_FIXED && vec < 16) {
vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+ VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
return (0);
}
-
+
+ VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
+
if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
switch (icrval & APIC_DEST_MASK) {
case APIC_DEST_DESTFLD:
@@ -963,8 +937,13 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu)
lapic_intr_edge(vlapic->vm, i, vec);
vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
IPIS_SENT, i, 1);
- } else
+ VLAPIC_CTR2(vlapic, "vlapic sending ipi %d "
+ "to vcpuid %d", vec, i);
+ } else {
vm_inject_nmi(vlapic->vm, i);
+ VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi "
+ "to vcpuid %d", i);
+ }
}
return (0); /* handled completely in the kernel */
@@ -1019,12 +998,15 @@ lapic_process_icr(struct vlapic *vlapic, uint64_t icrval, bool *retu)
}
int
-vlapic_pending_intr(struct vlapic *vlapic)
+vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
int idx, i, bitpos, vector;
uint32_t *irrptr, val;
+ if (vlapic->ops.pending_intr)
+ return ((*vlapic->ops.pending_intr)(vlapic, vecptr));
+
irrptr = &lapic->irr0;
/*
@@ -1039,21 +1021,26 @@ vlapic_pending_intr(struct vlapic *vlapic)
vector = i * 32 + (bitpos - 1);
if (PRIO(vector) > PRIO(lapic->ppr)) {
VLAPIC_CTR1(vlapic, "pending intr %d", vector);
- return (vector);
+ if (vecptr != NULL)
+ *vecptr = vector;
+ return (1);
} else
break;
}
}
- return (-1);
+ return (0);
}
void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
uint32_t *irrptr, *isrptr;
int idx, stk_top;
+ if (vlapic->ops.intr_accepted)
+ return ((*vlapic->ops.intr_accepted)(vlapic, vector));
+
/*
* clear the ready bit for vector being accepted in irr
* and set the vector as in service in isr.
@@ -1081,24 +1068,30 @@ vlapic_intr_accepted(struct vlapic *vlapic, int vector)
vlapic_update_ppr(vlapic);
}
-static void
-lapic_set_svr(struct vlapic *vlapic, uint32_t new)
+void
+vlapic_svr_write_handler(struct vlapic *vlapic)
{
struct LAPIC *lapic;
- uint32_t old, changed;
+ uint32_t old, new, changed;
+
+ lapic = vlapic->apic_page;
+
+ new = lapic->svr;
+ old = vlapic->svr_last;
+ vlapic->svr_last = new;
- lapic = &vlapic->apic;
- old = lapic->svr;
changed = old ^ new;
if ((changed & APIC_SVR_ENABLE) != 0) {
if ((new & APIC_SVR_ENABLE) == 0) {
/*
- * The apic is now disabled so stop the apic timer.
+ * The apic is now disabled so stop the apic timer
+ * and mask all the LVT entries.
*/
VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
VLAPIC_TIMER_LOCK(vlapic);
callout_stop(&vlapic->callout);
VLAPIC_TIMER_UNLOCK(vlapic);
+ vlapic_mask_lvts(vlapic);
} else {
/*
* The apic is now enabled so restart the apic timer
@@ -1106,16 +1099,15 @@ lapic_set_svr(struct vlapic *vlapic, uint32_t new)
*/
VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
if (vlapic_periodic_timer(vlapic))
- vlapic_set_icr_timer(vlapic, lapic->icr_timer);
+ vlapic_icrtmr_write_handler(vlapic);
}
}
- lapic->svr = new;
}
int
vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
uint32_t *reg;
int i;
@@ -1128,7 +1120,7 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
switch(offset)
{
case APIC_OFFSET_ID:
- *data = vlapic_get_id(vlapic);
+ *data = lapic->id;
break;
case APIC_OFFSET_VER:
*data = lapic->version;
@@ -1146,10 +1138,10 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
*data = lapic->eoi;
break;
case APIC_OFFSET_LDR:
- *data = vlapic_get_ldr(vlapic);
+ *data = lapic->ldr;
break;
case APIC_OFFSET_DFR:
- *data = vlapic_get_dfr(vlapic);
+ *data = lapic->dfr;
break;
case APIC_OFFSET_SVR:
*data = lapic->svr;
@@ -1174,6 +1166,8 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
break;
case APIC_OFFSET_ICR_LOW:
*data = lapic->icr_lo;
+ if (x2apic(vlapic))
+ *data |= (uint64_t)lapic->icr_hi << 32;
break;
case APIC_OFFSET_ICR_HI:
*data = lapic->icr_hi;
@@ -1181,14 +1175,19 @@ vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data, bool *retu)
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
*data = vlapic_get_lvt(vlapic, offset);
+#ifdef INVARIANTS
+ reg = vlapic_get_lvtptr(vlapic, offset);
+ KASSERT(*data == *reg, ("inconsistent lvt value at "
+ "offset %#lx: %#lx/%#x", offset, *data, *reg));
+#endif
break;
- case APIC_OFFSET_ICR:
+ case APIC_OFFSET_TIMER_ICR:
*data = lapic->icr_timer;
break;
- case APIC_OFFSET_CCR:
+ case APIC_OFFSET_TIMER_CCR:
*data = vlapic_get_ccr(vlapic);
break;
- case APIC_OFFSET_DCR:
+ case APIC_OFFSET_TIMER_DCR:
*data = lapic->dcr_timer;
break;
case APIC_OFFSET_RRR:
@@ -1204,9 +1203,13 @@ done:
int
vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
+ uint32_t *regptr;
int retval;
+ KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
+ ("vlapic_write: invalid offset %#lx", offset));
+
VLAPIC_CTR2(vlapic, "vlapic write offset %#x, data %#lx", offset, data);
if (offset > sizeof(*lapic)) {
@@ -1214,10 +1217,11 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
}
retval = 0;
- offset &= ~3;
switch(offset)
{
case APIC_OFFSET_ID:
+ lapic->id = data;
+ vlapic_id_write_handler(vlapic);
break;
case APIC_OFFSET_TPR:
lapic->tpr = data & 0xff;
@@ -1227,41 +1231,44 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
vlapic_process_eoi(vlapic);
break;
case APIC_OFFSET_LDR:
- vlapic_set_ldr(vlapic, data);
+ lapic->ldr = data;
+ vlapic_ldr_write_handler(vlapic);
break;
case APIC_OFFSET_DFR:
- vlapic_set_dfr(vlapic, data);
+ lapic->dfr = data;
+ vlapic_dfr_write_handler(vlapic);
break;
case APIC_OFFSET_SVR:
- lapic_set_svr(vlapic, data);
+ lapic->svr = data;
+ vlapic_svr_write_handler(vlapic);
break;
case APIC_OFFSET_ICR_LOW:
- if (!x2apic(vlapic)) {
- data &= 0xffffffff;
- data |= (uint64_t)lapic->icr_hi << 32;
- }
- retval = lapic_process_icr(vlapic, data, retu);
+ lapic->icr_lo = data;
+ if (x2apic(vlapic))
+ lapic->icr_hi = data >> 32;
+ retval = vlapic_icrlo_write_handler(vlapic, retu);
break;
case APIC_OFFSET_ICR_HI:
- if (!x2apic(vlapic)) {
- retval = 0;
- lapic->icr_hi = data;
- }
+ lapic->icr_hi = data;
break;
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
- vlapic_set_lvt(vlapic, offset, data);
+ regptr = vlapic_get_lvtptr(vlapic, offset);
+ *regptr = data;
+ vlapic_lvt_write_handler(vlapic, offset);
break;
- case APIC_OFFSET_ICR:
- vlapic_set_icr_timer(vlapic, data);
+ case APIC_OFFSET_TIMER_ICR:
+ lapic->icr_timer = data;
+ vlapic_icrtmr_write_handler(vlapic);
break;
- case APIC_OFFSET_DCR:
- vlapic_set_dcr(vlapic, data);
+ case APIC_OFFSET_TIMER_DCR:
+ lapic->dcr_timer = data;
+ vlapic_dcr_write_handler(vlapic);
break;
case APIC_OFFSET_ESR:
- vlapic_update_errors(vlapic);
+ vlapic_esr_write_handler(vlapic);
break;
case APIC_OFFSET_VER:
case APIC_OFFSET_APR:
@@ -1270,7 +1277,7 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
- case APIC_OFFSET_CCR:
+ case APIC_OFFSET_TIMER_CCR:
default:
// Read only.
break;
@@ -1279,14 +1286,41 @@ vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data, bool *retu)
return (retval);
}
-struct vlapic *
-vlapic_init(struct vm *vm, int vcpuid)
+static void
+vlapic_reset(struct vlapic *vlapic)
{
- struct vlapic *vlapic;
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+ bzero(lapic, sizeof(struct LAPIC));
- vlapic = malloc(sizeof(struct vlapic), M_VLAPIC, M_WAITOK | M_ZERO);
- vlapic->vm = vm;
- vlapic->vcpuid = vcpuid;
+ lapic->id = vlapic_get_id(vlapic);
+ lapic->version = VLAPIC_VERSION;
+ lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(vlapic);
+ vlapic_reset_tmr(vlapic);
+
+ lapic->dcr_timer = 0;
+ vlapic_dcr_write_handler(vlapic);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+ vlapic->svr_last = lapic->svr;
+}
+
+void
+vlapic_init(struct vlapic *vlapic)
+{
+ KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
+ KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU,
+ ("vlapic_init: vcpuid is not initialized"));
+ KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
+ "initialized"));
/*
* If the vlapic is configured in x2apic mode then it will be
@@ -1300,12 +1334,10 @@ vlapic_init(struct vm *vm, int vcpuid)
vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
- if (vcpuid == 0)
+ if (vlapic->vcpuid == 0)
vlapic->msr_apicbase |= APICBASE_BSP;
vlapic_reset(vlapic);
-
- return (vlapic);
}
void
@@ -1313,7 +1345,6 @@ vlapic_cleanup(struct vlapic *vlapic)
{
callout_drain(&vlapic->callout);
- free(vlapic, M_VLAPIC);
}
uint64_t
@@ -1324,19 +1355,38 @@ vlapic_get_apicbase(struct vlapic *vlapic)
}
void
-vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new)
{
- int err;
+ struct LAPIC *lapic;
enum x2apic_state state;
+ uint64_t old;
+ int err;
err = vm_get_x2apic_state(vlapic->vm, vlapic->vcpuid, &state);
if (err)
panic("vlapic_set_apicbase: err %d fetching x2apic state", err);
if (state == X2APIC_DISABLED)
- val &= ~APICBASE_X2APIC;
+ new &= ~APICBASE_X2APIC;
+
+ old = vlapic->msr_apicbase;
+ vlapic->msr_apicbase = new;
- vlapic->msr_apicbase = val;
+ /*
+ * If the vlapic is switching between xAPIC and x2APIC modes then
+ * reset the mode-dependent registers.
+ */
+ if ((old ^ new) & APICBASE_X2APIC) {
+ lapic = vlapic->apic_page;
+ lapic->id = vlapic_get_id(vlapic);
+ if (x2apic(vlapic)) {
+ lapic->ldr = x2apic_ldr(vlapic);
+ lapic->dfr = 0;
+ } else {
+ lapic->ldr = 0;
+ lapic->dfr = 0xffffffff;
+ }
+ }
}
void
@@ -1378,10 +1428,28 @@ vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
}
}
+void
+vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
+{
+ /*
+ * Post an interrupt to the vcpu currently running on 'hostcpu'.
+ *
+	 * This is done by leveraging features like Posted Interrupts (Intel)
+	 * or the Doorbell MSR (AMD AVIC) that avoid a VM exit.
+	 *
+	 * If neither of these features is available then fall back to
+	 * sending an IPI to 'hostcpu'.
+ */
+ if (vlapic->ops.post_intr)
+ (*vlapic->ops.post_intr)(vlapic, hostcpu);
+ else
+ ipi_cpu(hostcpu, ipinum);
+}
+
bool
vlapic_enabled(struct vlapic *vlapic)
{
- struct LAPIC *lapic = &vlapic->apic;
+ struct LAPIC *lapic = vlapic->apic_page;
if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
(lapic->svr & APIC_SVR_ENABLE) != 0)
@@ -1389,3 +1457,62 @@ vlapic_enabled(struct vlapic *vlapic)
else
return (false);
}
+
+static void
+vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level)
+{
+ struct LAPIC *lapic;
+ uint32_t *tmrptr, mask;
+ int idx;
+
+ lapic = vlapic->apic_page;
+ tmrptr = &lapic->tmr0;
+ idx = (vector / 32) * 4;
+ mask = 1 << (vector % 32);
+ if (level)
+ tmrptr[idx] |= mask;
+ else
+ tmrptr[idx] &= ~mask;
+
+ if (vlapic->ops.set_tmr != NULL)
+ (*vlapic->ops.set_tmr)(vlapic, vector, level);
+}
+
+void
+vlapic_reset_tmr(struct vlapic *vlapic)
+{
+ int vector;
+
+ VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");
+
+ for (vector = 0; vector <= 255; vector++)
+ vlapic_set_tmr(vlapic, vector, false);
+}
+
+void
+vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
+ int delmode, int vector)
+{
+ cpuset_t dmask;
+ bool lowprio;
+
+ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
+
+ /*
+ * A level trigger is valid only for fixed and lowprio delivery modes.
+ */
+ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
+ VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
+ "delivery-mode %d", delmode);
+ return;
+ }
+
+ lowprio = (delmode == APIC_DELMODE_LOWPRIO);
+ vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);
+
+ if (!CPU_ISSET(vlapic->vcpuid, &dmask))
+ return;
+
+ VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
+ vlapic_set_tmr(vlapic, vector, true);
+}
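
The TMR changes above are the vlapic half of level-trigger support: every
vcpu's trigger-mode bits must agree before a level-triggered vector is
injected. A minimal sketch, not part of this commit, of how an interrupt
source such as the vioapic might combine the new rendezvous API with
vlapic_set_tmr_level(); 'struct ioapic_pin' and both function names are
hypothetical:

/*
 * Sketch: update the TMR on all vcpus, then deliver the level-triggered
 * interrupt. Assumes the declarations from <machine/vmm.h> and vlapic.h.
 */
struct ioapic_pin {			/* hypothetical */
	uint32_t dest;
	bool	 phys;
	int	 delmode;
	int	 vector;
};

static void
set_tmr_cb(struct vm *vm, int vcpuid, void *arg)
{
	struct ioapic_pin *pin = arg;

	/* Runs on each rendezvoused vcpu while it is out of the guest. */
	vlapic_set_tmr_level(vm_lapic(vm, vcpuid), pin->dest, pin->phys,
	    pin->delmode, pin->vector);
}

static void
assert_level_pin(struct vm *vm, int vcpuid, struct ioapic_pin *pin)
{
	cpuset_t dmask;

	/* The rendezvous code clips this set to the active vcpus. */
	CPU_FILL(&dmask);
	vm_smp_rendezvous(vm, vcpuid, dmask, set_tmr_cb, pin);

	vlapic_deliver_intr(vm, true, pin->dest, pin->phys, pin->delmode,
	    pin->vector);
}
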
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
index 98f377e..d2fc6d9 100644
--- a/sys/amd64/vmm/io/vlapic.h
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -30,74 +30,45 @@
#define _VLAPIC_H_
struct vm;
-
-/*
- * Map of APIC Registers: Offset Description Access
- */
-#define APIC_OFFSET_ID 0x20 // Local APIC ID R/W
-#define APIC_OFFSET_VER 0x30 // Local APIC Version R
-#define APIC_OFFSET_TPR 0x80 // Task Priority Register R/W
-#define APIC_OFFSET_APR 0x90 // Arbitration Priority Register R
-#define APIC_OFFSET_PPR 0xA0 // Processor Priority Register R
-#define APIC_OFFSET_EOI 0xB0 // EOI Register W
-#define APIC_OFFSET_RRR 0xC0 // Remote read R
-#define APIC_OFFSET_LDR 0xD0 // Logical Destination R/W
-#define APIC_OFFSET_DFR 0xE0 // Destination Format Register 0..27 R; 28..31 R/W
-#define APIC_OFFSET_SVR 0xF0 // Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
-#define APIC_OFFSET_ISR0 0x100 // ISR 000-031 R
-#define APIC_OFFSET_ISR1 0x110 // ISR 032-063 R
-#define APIC_OFFSET_ISR2 0x120 // ISR 064-095 R
-#define APIC_OFFSET_ISR3 0x130 // ISR 095-128 R
-#define APIC_OFFSET_ISR4 0x140 // ISR 128-159 R
-#define APIC_OFFSET_ISR5 0x150 // ISR 160-191 R
-#define APIC_OFFSET_ISR6 0x160 // ISR 192-223 R
-#define APIC_OFFSET_ISR7 0x170 // ISR 224-255 R
-#define APIC_OFFSET_TMR0 0x180 // TMR 000-031 R
-#define APIC_OFFSET_TMR1 0x190 // TMR 032-063 R
-#define APIC_OFFSET_TMR2 0x1A0 // TMR 064-095 R
-#define APIC_OFFSET_TMR3 0x1B0 // TMR 095-128 R
-#define APIC_OFFSET_TMR4 0x1C0 // TMR 128-159 R
-#define APIC_OFFSET_TMR5 0x1D0 // TMR 160-191 R
-#define APIC_OFFSET_TMR6 0x1E0 // TMR 192-223 R
-#define APIC_OFFSET_TMR7 0x1F0 // TMR 224-255 R
-#define APIC_OFFSET_IRR0 0x200 // IRR 000-031 R
-#define APIC_OFFSET_IRR1 0x210 // IRR 032-063 R
-#define APIC_OFFSET_IRR2 0x220 // IRR 064-095 R
-#define APIC_OFFSET_IRR3 0x230 // IRR 095-128 R
-#define APIC_OFFSET_IRR4 0x240 // IRR 128-159 R
-#define APIC_OFFSET_IRR5 0x250 // IRR 160-191 R
-#define APIC_OFFSET_IRR6 0x260 // IRR 192-223 R
-#define APIC_OFFSET_IRR7 0x270 // IRR 224-255 R
-#define APIC_OFFSET_ESR 0x280 // Error Status Register R
-#define APIC_OFFSET_CMCI_LVT 0x2F0 // Local Vector Table (CMCI) R/W
-#define APIC_OFFSET_ICR_LOW 0x300 // Interrupt Command Reg. (0-31) R/W
-#define APIC_OFFSET_ICR_HI 0x310 // Interrupt Command Reg. (32-63) R/W
-#define APIC_OFFSET_TIMER_LVT 0x320 // Local Vector Table (Timer) R/W
-#define APIC_OFFSET_THERM_LVT 0x330 // Local Vector Table (Thermal) R/W (PIV+)
-#define APIC_OFFSET_PERF_LVT 0x340 // Local Vector Table (Performance) R/W (P6+)
-#define APIC_OFFSET_LINT0_LVT 0x350 // Local Vector Table (LINT0) R/W
-#define APIC_OFFSET_LINT1_LVT 0x360 // Local Vector Table (LINT1) R/W
-#define APIC_OFFSET_ERROR_LVT 0x370 // Local Vector Table (ERROR) R/W
-#define APIC_OFFSET_ICR 0x380 // Initial Count Reg. for Timer R/W
-#define APIC_OFFSET_CCR 0x390 // Current Count of Timer R
-#define APIC_OFFSET_DCR 0x3E0 // Timer Divide Configuration Reg. R/W
-
-/*
- * 16 priority levels with at most one vector injected per level.
- */
-#define ISRVEC_STK_SIZE (16 + 1)
-
enum x2apic_state;
-struct vlapic *vlapic_init(struct vm *vm, int vcpuid);
-void vlapic_cleanup(struct vlapic *vlapic);
int vlapic_write(struct vlapic *vlapic, uint64_t offset, uint64_t data,
bool *retu);
int vlapic_read(struct vlapic *vlapic, uint64_t offset, uint64_t *data,
bool *retu);
-int vlapic_pending_intr(struct vlapic *vlapic);
+
+/*
+ * Returns 0 if there is no eligible vector that can be delivered to the
+ * guest at this time and non-zero otherwise.
+ *
+ * If an eligible vector number is found and 'vecptr' is not NULL then it will
+ * be stored in the location pointed to by 'vecptr'.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ */
+int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'vlapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
-void vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level);
+
+/*
+ * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise.
+ */
+int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level);
+
+/*
+ * Post an interrupt to the vcpu running on 'hostcpu'. This will use a
+ * hardware assist if available (e.g. Posted Interrupt) or fall back to
+ * sending an 'ipinum' to interrupt the 'hostcpu'.
+ */
+void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum);
+
void vlapic_set_error(struct vlapic *vlapic, uint32_t mask);
void vlapic_fire_cmci(struct vlapic *vlapic);
int vlapic_trigger_lvt(struct vlapic *vlapic, int vector);
@@ -109,4 +80,26 @@ bool vlapic_enabled(struct vlapic *vlapic);
void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
int delmode, int vec);
+
+/* Reset the trigger-mode bits for all vectors to be edge-triggered */
+void vlapic_reset_tmr(struct vlapic *vlapic);
+
+/*
+ * Set the trigger-mode bit associated with 'vector' to level-triggered if
+ * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to
+ * this 'vlapic'.
+ */
+void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
+ int delmode, int vector);
+
+/* APIC write handlers */
+void vlapic_id_write_handler(struct vlapic *vlapic);
+void vlapic_ldr_write_handler(struct vlapic *vlapic);
+void vlapic_dfr_write_handler(struct vlapic *vlapic);
+void vlapic_svr_write_handler(struct vlapic *vlapic);
+void vlapic_esr_write_handler(struct vlapic *vlapic);
+int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu);
+void vlapic_icrtmr_write_handler(struct vlapic *vlapic);
+void vlapic_dcr_write_handler(struct vlapic *vlapic);
+void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset);
#endif /* _VLAPIC_H_ */
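
The header comments above describe the new delivery contract: the pending
vector is now returned through 'vecptr' and the return value only says
whether something is deliverable. A minimal sketch, not part of this
commit, of the injection check a vmrun loop might perform; inject_vector()
is a hypothetical stand-in for the backend's injection primitive:

static void inject_vector(int vector);	/* hypothetical backend hook */

static void
inject_pending_interrupt(struct vlapic *vlapic, bool intr_window_open)
{
	int vector;

	if (!vlapic_pending_intr(vlapic, &vector))
		return;		/* nothing deliverable at the current PPR */

	if (!intr_window_open)
		return;		/* ask for an interrupt-window exit instead */

	inject_vector(vector);
	vlapic_intr_accepted(vlapic, vector);	/* IRR -> ISR transition */
}
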
diff --git a/sys/amd64/vmm/io/vlapic_priv.h b/sys/amd64/vmm/io/vlapic_priv.h
new file mode 100644
index 0000000..a4e96aa
--- /dev/null
+++ b/sys/amd64/vmm/io/vlapic_priv.h
@@ -0,0 +1,185 @@
+/*-
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VLAPIC_PRIV_H_
+#define _VLAPIC_PRIV_H_
+
+#include <x86/apicreg.h>
+
+/*
+ * APIC Register: Offset Description
+ */
+#define APIC_OFFSET_ID 0x20 /* Local APIC ID */
+#define APIC_OFFSET_VER 0x30 /* Local APIC Version */
+#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */
+#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */
+#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */
+#define APIC_OFFSET_EOI 0xB0 /* EOI Register */
+#define APIC_OFFSET_RRR 0xC0 /* Remote read */
+#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */
+#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */
+#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */
+#define APIC_OFFSET_ISR0 0x100 /* In Service Register */
+#define APIC_OFFSET_ISR1 0x110
+#define APIC_OFFSET_ISR2 0x120
+#define APIC_OFFSET_ISR3 0x130
+#define APIC_OFFSET_ISR4 0x140
+#define APIC_OFFSET_ISR5 0x150
+#define APIC_OFFSET_ISR6 0x160
+#define APIC_OFFSET_ISR7 0x170
+#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */
+#define APIC_OFFSET_TMR1 0x190
+#define APIC_OFFSET_TMR2 0x1A0
+#define APIC_OFFSET_TMR3 0x1B0
+#define APIC_OFFSET_TMR4 0x1C0
+#define APIC_OFFSET_TMR5 0x1D0
+#define APIC_OFFSET_TMR6 0x1E0
+#define APIC_OFFSET_TMR7 0x1F0
+#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */
+#define APIC_OFFSET_IRR1 0x210
+#define APIC_OFFSET_IRR2 0x220
+#define APIC_OFFSET_IRR3 0x230
+#define APIC_OFFSET_IRR4 0x240
+#define APIC_OFFSET_IRR5 0x250
+#define APIC_OFFSET_IRR6 0x260
+#define APIC_OFFSET_IRR7 0x270
+#define APIC_OFFSET_ESR 0x280 /* Error Status Register */
+#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */
+#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */
+#define APIC_OFFSET_ICR_HI 0x310
+#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */
+#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */
+#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */
+#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */
+#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */
+#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */
+#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */
+#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */
+#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR2(vlapic, format, p1, p2) \
+ VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic_page->irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic_page->isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
+
+#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI
+
+struct vlapic;
+
+struct vlapic_ops {
+ int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level);
+ int (*pending_intr)(struct vlapic *vlapic, int *vecptr);
+ void (*intr_accepted)(struct vlapic *vlapic, int vector);
+ void (*post_intr)(struct vlapic *vlapic, int hostcpu);
+ void (*set_tmr)(struct vlapic *vlapic, int vector, bool level);
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+ struct LAPIC *apic_page;
+ struct vlapic_ops ops;
+
+ uint32_t esr_pending;
+ int esr_firing;
+
+ struct callout callout; /* vlapic timer */
+ struct bintime timer_fire_bt; /* callout expiry time */
+ struct bintime timer_freq_bt; /* timer frequency */
+ struct bintime timer_period_bt; /* timer period */
+ struct mtx timer_mtx;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+
+ /*
+ * Copies of some registers in the virtual APIC page. We do this for
+ * a couple of different reasons:
+ * - to be able to detect what changed (e.g. svr_last)
+ * - to maintain a coherent snapshot of the register (e.g. lvt_last)
+ */
+ uint32_t svr_last;
+ uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1];
+};
+
+void vlapic_init(struct vlapic *vlapic);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+#endif /* _VLAPIC_PRIV_H_ */
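
The ops vector above is the hook point for hardware assists: a backend
that supports, say, VT-x virtual interrupt delivery allocates the
enclosing structure itself and fills in whichever operations the hardware
accelerates. A minimal sketch, not part of this commit, with hypothetical
vtx_* names:

static int  vtx_set_intr_ready(struct vlapic *, int, bool);	/* hypothetical */
static int  vtx_pending_intr(struct vlapic *, int *);		/* hypothetical */
static void vtx_intr_accepted(struct vlapic *, int);		/* hypothetical */
static void vtx_post_intr(struct vlapic *, int);		/* hypothetical */
static void vtx_set_tmr(struct vlapic *, int, bool);		/* hypothetical */

struct vtx_vlapic {
	struct vlapic	vlapic;		/* embedded; backend owns storage */
	/* backend-private state, e.g. a posted-interrupt descriptor */
};

static void
vtx_vlapic_init(struct vtx_vlapic *vtx, struct vm *vm, int vcpuid,
    struct LAPIC *apic_page)
{
	struct vlapic *vlapic = &vtx->vlapic;

	/* vlapic_init() asserts that these are set by the caller. */
	vlapic->vm = vm;
	vlapic->vcpuid = vcpuid;
	vlapic->apic_page = apic_page;

	vlapic->ops.set_intr_ready = vtx_set_intr_ready;
	vlapic->ops.pending_intr = vtx_pending_intr;
	vlapic->ops.intr_accepted = vtx_intr_accepted;
	vlapic->ops.post_intr = vtx_post_intr;
	vlapic->ops.set_tmr = vtx_set_tmr;

	vlapic_init(vlapic);
}
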
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index f471218b..2c86068 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -124,19 +124,25 @@ struct vm {
* An active vcpu is one that has been started implicitly (BSP) or
* explicitly (AP) by sending it a startup ipi.
*/
- cpuset_t active_cpus;
+ volatile cpuset_t active_cpus;
+
+ struct mtx rendezvous_mtx;
+ cpuset_t rendezvous_req_cpus;
+ cpuset_t rendezvous_done_cpus;
+ void *rendezvous_arg;
+ vm_rendezvous_func_t rendezvous_func;
};
static int vmm_initialized;
static struct vmm_ops *ops;
-#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
-#define VMRUN(vmi, vcpu, rip, pmap) \
- (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
+#define VMRUN(vmi, vcpu, rip, pmap, rptr) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
@@ -156,6 +162,10 @@ static struct vmm_ops *ops;
(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val) \
(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
+#define VLAPIC_INIT(vmi, vcpu) \
+ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
+#define VLAPIC_CLEANUP(vmi, vlapic) \
+ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating() clts()
@@ -166,10 +176,20 @@ CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+
+static int vmm_ipinum;
+SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
+ "IPI vector used for vcpu notifications");
+
+static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
+
static void
-vcpu_cleanup(struct vcpu *vcpu)
+vcpu_cleanup(struct vm *vm, int i)
{
- vlapic_cleanup(vcpu->vlapic);
+ struct vcpu *vcpu = &vm->vcpu[i];
+
+ VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
vmm_stat_free(vcpu->stats);
fpu_save_area_free(vcpu->guestfpu);
}
@@ -184,7 +204,7 @@ vcpu_init(struct vm *vm, uint32_t vcpu_id)
vcpu_lock_init(vcpu);
vcpu->hostcpu = NOCPU;
vcpu->vcpuid = vcpu_id;
- vcpu->vlapic = vlapic_init(vm, vcpu_id);
+ vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
vcpu->guestfpu = fpu_save_area_alloc();
fpu_save_area_reset(vcpu->guestfpu);
@@ -216,7 +236,10 @@ vmm_init(void)
int error;
vmm_host_state_init();
- vmm_ipi_init();
+
+ vmm_ipinum = vmm_ipi_alloc();
+ if (vmm_ipinum == 0)
+ vmm_ipinum = IPI_AST;
error = vmm_mem_init();
if (error)
@@ -232,7 +255,7 @@ vmm_init(void)
vmm_msr_init();
vmm_resume_p = vmm_resume;
- return (VMM_INIT());
+ return (VMM_INIT(vmm_ipinum));
}
static int
@@ -253,7 +276,8 @@ vmm_handler(module_t mod, int what, void *arg)
if (error == 0) {
vmm_resume_p = NULL;
iommu_cleanup();
- vmm_ipi_cleanup();
+ if (vmm_ipinum != IPI_AST)
+ vmm_ipi_free(vmm_ipinum);
error = VMM_CLEANUP();
/*
* Something bad happened - prevent new
@@ -288,8 +312,6 @@ static moduledata_t vmm_kmod = {
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
-SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
-
int
vm_create(const char *name, struct vm **retvm)
{
@@ -315,6 +337,8 @@ vm_create(const char *name, struct vm **retvm)
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
+ vm->vmspace = vmspace;
+ mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
vm->vioapic = vioapic_init(vm);
vm->vhpet = vhpet_init(vm);
@@ -325,7 +349,6 @@ vm_create(const char *name, struct vm **retvm)
}
vm_activate_cpu(vm, BSP);
- vm->vmspace = vmspace;
*retvm = vm;
return (0);
@@ -360,7 +383,7 @@ vm_destroy(struct vm *vm)
vm->num_mem_segs = 0;
for (i = 0; i < VM_MAXCPU; i++)
- vcpu_cleanup(&vm->vcpu[i]);
+ vcpu_cleanup(vm, i);
VMSPACE_FREE(vm->vmspace);
@@ -866,6 +889,63 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
panic("Error %d setting state to %d", error, newstate);
}
+static void
+vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
+{
+
+ KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
+
+ /*
+ * Update 'rendezvous_func' and execute a write memory barrier to
+ * ensure that it is visible across all host cpus. This is not needed
+	 * for correctness but it does ensure that all the vcpus will
+	 * notice the rendezvous request immediately.
+ */
+ vm->rendezvous_func = func;
+ wmb();
+}
+
+#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
+ do { \
+ if (vcpuid >= 0) \
+ VCPU_CTR0(vm, vcpuid, fmt); \
+ else \
+ VM_CTR0(vm, fmt); \
+ } while (0)
+
+static void
+vm_handle_rendezvous(struct vm *vm, int vcpuid)
+{
+
+ KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
+ ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
+
+ mtx_lock(&vm->rendezvous_mtx);
+ while (vm->rendezvous_func != NULL) {
+ /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
+ CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
+
+ if (vcpuid != -1 &&
+ CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
+ !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
+ VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
+ (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
+ CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
+ }
+ if (CPU_CMP(&vm->rendezvous_req_cpus,
+ &vm->rendezvous_done_cpus) == 0) {
+ VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
+ vm_set_rendezvous_func(vm, NULL);
+ wakeup(&vm->rendezvous_func);
+ break;
+ }
+ RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
+ mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
+ "vmrndv", 0);
+ }
+ mtx_unlock(&vm->rendezvous_mtx);
+}
+
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
@@ -874,9 +954,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
struct vm_exit *vmexit;
struct vcpu *vcpu;
- int t, timo;
+ int t, timo, spindown;
vcpu = &vm->vcpu[vcpuid];
+ spindown = 0;
vcpu_lock(vcpu);
@@ -888,7 +969,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
* returned from VMRUN() and before we grabbed the vcpu lock.
*/
if (!vm_nmi_pending(vm, vcpuid) &&
- (intr_disabled || vlapic_pending_intr(vcpu->vlapic) < 0)) {
+ (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
if (vlapic_enabled(vcpu->vlapic)) {
@@ -903,16 +984,25 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
* Spindown the vcpu if the apic is disabled and it
* had entered the halted state.
*/
- *retu = true;
- vmexit = vm_exitinfo(vm, vcpuid);
- vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
- VCPU_CTR0(vm, vcpuid, "spinning down cpu");
+ spindown = 1;
}
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
vcpu_unlock(vcpu);
+ /*
+ * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
+ * outside the confines of the vcpu spinlock.
+ */
+ if (spindown) {
+ *retu = true;
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
+ vm_deactivate_cpu(vm, vcpuid);
+ VCPU_CTR0(vm, vcpuid, "spinning down cpu");
+ }
+
return (0);
}
@@ -1042,7 +1132,7 @@ restart:
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
vcpu->hostcpu = curcpu;
- error = VMRUN(vm->cookie, vcpuid, rip, pmap);
+ error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
vcpu->hostcpu = NOCPU;
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
@@ -1056,6 +1146,14 @@ restart:
if (error == 0) {
retu = false;
switch (vme->exitcode) {
+ case VM_EXITCODE_IOAPIC_EOI:
+ vioapic_process_eoi(vm, vcpuid,
+ vme->u.ioapic_eoi.vector);
+ break;
+ case VM_EXITCODE_RENDEZVOUS:
+ vm_handle_rendezvous(vm, vcpuid);
+ error = 0;
+ break;
case VM_EXITCODE_HLT:
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
@@ -1111,7 +1209,7 @@ vm_inject_nmi(struct vm *vm, int vcpuid)
vcpu = &vm->vcpu[vcpuid];
vcpu->nmi_pending = 1;
- vcpu_notify_event(vm, vcpuid);
+ vcpu_notify_event(vm, vcpuid, false);
return (0);
}
@@ -1286,8 +1384,37 @@ void
vm_activate_cpu(struct vm *vm, int vcpuid)
{
- if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
- CPU_SET(vcpuid, &vm->active_cpus);
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
+ ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
+ KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
+ ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
+
+ VCPU_CTR0(vm, vcpuid, "activated");
+ CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
+}
+
+static void
+vm_deactivate_cpu(struct vm *vm, int vcpuid)
+{
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
+ ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
+ KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
+ ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
+
+ VCPU_CTR0(vm, vcpuid, "deactivated");
+ CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
+
+ /*
+ * If a vcpu rendezvous is in progress then it could be blocked
+ * on 'vcpuid' - unblock it before disappearing forever.
+ */
+ mtx_lock(&vm->rendezvous_mtx);
+ if (vm->rendezvous_func != NULL) {
+ VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
+ wakeup(&vm->rendezvous_func);
+ }
+ mtx_unlock(&vm->rendezvous_mtx);
}
cpuset_t
@@ -1339,7 +1466,7 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
* to the host_cpu to cause the vcpu to trap into the hypervisor.
*/
void
-vcpu_notify_event(struct vm *vm, int vcpuid)
+vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
int hostcpu;
struct vcpu *vcpu;
@@ -1354,8 +1481,13 @@ vcpu_notify_event(struct vm *vm, int vcpuid)
} else {
if (vcpu->state != VCPU_RUNNING)
panic("invalid vcpu state %d", vcpu->state);
- if (hostcpu != curcpu)
- ipi_cpu(hostcpu, vmm_ipinum);
+ if (hostcpu != curcpu) {
+ if (lapic_intr)
+ vlapic_post_intr(vcpu->vlapic, hostcpu,
+ vmm_ipinum);
+ else
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
}
vcpu_unlock(vcpu);
}
@@ -1375,3 +1507,51 @@ vm_apicid2vcpuid(struct vm *vm, int apicid)
*/
return (apicid);
}
+
+void
+vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
+ vm_rendezvous_func_t func, void *arg)
+{
+ int i;
+
+ /*
+	 * Enforce that this function is called without any locks held.
+ */
+ WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
+ KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
+ ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
+
+restart:
+ mtx_lock(&vm->rendezvous_mtx);
+ if (vm->rendezvous_func != NULL) {
+ /*
+ * If a rendezvous is already in progress then we need to
+ * call the rendezvous handler in case this 'vcpuid' is one
+ * of the targets of the rendezvous.
+ */
+ RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
+ mtx_unlock(&vm->rendezvous_mtx);
+ vm_handle_rendezvous(vm, vcpuid);
+ goto restart;
+ }
+ KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
+ "rendezvous is still in progress"));
+
+ RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
+ vm->rendezvous_req_cpus = dest;
+ CPU_ZERO(&vm->rendezvous_done_cpus);
+ vm->rendezvous_arg = arg;
+ vm_set_rendezvous_func(vm, func);
+ mtx_unlock(&vm->rendezvous_mtx);
+
+ /*
+ * Wake up any sleeping vcpus and trigger a VM-exit in any running
+ * vcpus so they handle the rendezvous as soon as possible.
+ */
+ for (i = 0; i < VM_MAXCPU; i++) {
+ if (CPU_ISSET(i, &dest))
+ vcpu_notify_event(vm, i, false);
+ }
+
+ vm_handle_rendezvous(vm, vcpuid);
+}
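
The extra 'rptr' argument to VMRUN() above gives the backend a cheap way
to poll for a pending rendezvous without taking any locks. A minimal
sketch, not part of this commit, of the check a vmrun loop might make
just before resuming the guest; the surrounding loop details are elided:

static int
vmrun_check_rendezvous(void *rendezvous_cookie, struct vm_exit *vmexit)
{
	/*
	 * 'rendezvous_cookie' points at vm->rendezvous_func, which is
	 * written under rendezvous_mtx and followed by a wmb(); a plain
	 * volatile read is therefore enough to notice a new request.
	 */
	if (*(void * volatile *)rendezvous_cookie != NULL) {
		vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
		return (1);	/* bounce to vm_run() -> vm_handle_rendezvous() */
	}
	return (0);
}
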
diff --git a/sys/amd64/vmm/vmm_ipi.c b/sys/amd64/vmm/vmm_ipi.c
index 643d326..1765284 100644
--- a/sys/amd64/vmm/vmm_ipi.c
+++ b/sys/amd64/vmm/vmm_ipi.c
@@ -44,15 +44,10 @@ __FBSDID("$FreeBSD$");
extern inthand_t IDTVEC(rsvd), IDTVEC(justreturn);
-/*
- * The default is to use the IPI_AST to interrupt a vcpu.
- */
-int vmm_ipinum = IPI_AST;
-
CTASSERT(APIC_SPURIOUS_INT == 255);
-void
-vmm_ipi_init(void)
+int
+vmm_ipi_alloc(void)
{
int idx;
uintptr_t func;
@@ -72,22 +67,27 @@ vmm_ipi_init(void)
ip = &idt[idx];
func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
if (func == (uintptr_t)&IDTVEC(rsvd)) {
- vmm_ipinum = idx;
- setidt(vmm_ipinum, IDTVEC(justreturn), SDT_SYSIGT,
+		setidt(idx, IDTVEC(justreturn), SDT_SYSIGT,
SEL_KPL, 0);
- break;
+ return (idx);
}
}
-
- if (vmm_ipinum != IPI_AST && bootverbose) {
- printf("vmm_ipi_init: installing ipi handler to interrupt "
- "vcpus at vector %d\n", vmm_ipinum);
- }
+ return (0);
}
void
-vmm_ipi_cleanup(void)
+vmm_ipi_free(int ipinum)
{
- if (vmm_ipinum != IPI_AST)
- setidt(vmm_ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ uintptr_t func;
+ struct gate_descriptor *ip;
+
+ KASSERT(ipinum >= APIC_IPI_INTS && ipinum < APIC_SPURIOUS_INT,
+ ("invalid ipi %d", ipinum));
+
+ ip = &idt[ipinum];
+ func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
+ KASSERT(func == (uintptr_t)&IDTVEC(justreturn),
+ ("invalid ipi %d", ipinum));
+
+ setidt(ipinum, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
}
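
vmm_ipi_alloc() now returns 0 when no unused IDT slot is found, so the
caller decides on the fallback. A minimal sketch, not part of this
commit, of the allocate-with-fallback pattern vmm_init() uses above; a
dedicated 'justreturn' vector is preferred because its handler is just an
EOI and an iret, while IPI_AST remains a correct but costlier fallback:

static int my_ipinum;		/* hypothetical module-local copy */

static void
my_module_init(void)
{
	my_ipinum = vmm_ipi_alloc();
	if (my_ipinum == 0)	/* no free IDT vector available */
		my_ipinum = IPI_AST;
}

static void
my_module_cleanup(void)
{
	/* Only free vectors that were actually allocated. */
	if (my_ipinum != IPI_AST)
		vmm_ipi_free(my_ipinum);
}
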
diff --git a/sys/amd64/vmm/vmm_ipi.h b/sys/amd64/vmm/vmm_ipi.h
index 91552e3..679d183 100644
--- a/sys/amd64/vmm/vmm_ipi.h
+++ b/sys/amd64/vmm/vmm_ipi.h
@@ -29,11 +29,7 @@
#ifndef _VMM_IPI_H_
#define _VMM_IPI_H_
-struct vm;
-
-extern int vmm_ipinum;
-
-void vmm_ipi_init(void);
-void vmm_ipi_cleanup(void);
+int vmm_ipi_alloc(void);
+void vmm_ipi_free(int num);
#endif
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
index 8d915cd..47e04da 100644
--- a/sys/amd64/vmm/vmm_lapic.c
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -51,26 +51,6 @@ __FBSDID("$FreeBSD$");
#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */
int
-lapic_pending_intr(struct vm *vm, int cpu)
-{
- struct vlapic *vlapic;
-
- vlapic = vm_lapic(vm, cpu);
-
- return (vlapic_pending_intr(vlapic));
-}
-
-void
-lapic_intr_accepted(struct vm *vm, int cpu, int vector)
-{
- struct vlapic *vlapic;
-
- vlapic = vm_lapic(vm, cpu);
-
- vlapic_intr_accepted(vlapic, vector);
-}
-
-int
lapic_set_intr(struct vm *vm, int cpu, int vector, bool level)
{
struct vlapic *vlapic;
@@ -82,10 +62,8 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level)
return (EINVAL);
vlapic = vm_lapic(vm, cpu);
- vlapic_set_intr_ready(vlapic, vector, level);
-
- vcpu_notify_event(vm, cpu);
-
+ if (vlapic_set_intr_ready(vlapic, vector, level))
+ vcpu_notify_event(vm, cpu, true);
return (0);
}
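
lapic_set_intr() now notifies the vcpu only when vlapic_set_intr_ready()
reports that a notification is needed, and passes 'true' so the
notification can ride a posted interrupt where hardware supports it. A
minimal sketch, not part of this commit, of a device model asserting an
edge-triggered interrupt through this path:

static void
device_raise_intr(struct vm *vm, int cpu, int vector)
{
	int error;

	/* 'false' selects edge trigger; lapic_set_intr() handles notify. */
	error = lapic_set_intr(vm, cpu, vector, false);
	KASSERT(error == 0, ("bad cpu %d or vector %d", cpu, vector));
}
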
diff --git a/sys/amd64/vmm/vmm_lapic.h b/sys/amd64/vmm/vmm_lapic.h
index c5c95aa..88fa948 100644
--- a/sys/amd64/vmm/vmm_lapic.h
+++ b/sys/amd64/vmm/vmm_lapic.h
@@ -43,26 +43,6 @@ int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
uint64_t wval, int size, void *arg);
/*
- * Returns a vector between 32 and 255 if an interrupt is pending in the
- * IRR that can be delivered based on the current state of ISR and TPR.
- *
- * Note that the vector does not automatically transition to the ISR as a
- * result of calling this function.
- *
- * Returns -1 if there is no eligible vector that can be delivered to the
- * guest at this time.
- */
-int lapic_pending_intr(struct vm *vm, int cpu);
-
-/*
- * Transition 'vector' from IRR to ISR. This function is called with the
- * vector returned by 'lapic_pending_intr()' when the guest is able to
- * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
- * block interrupt delivery).
- */
-void lapic_intr_accepted(struct vm *vm, int cpu, int vector);
-
-/*
* Signals to the LAPIC that an interrupt at 'vector' needs to be generated
* to the 'cpu', the state is recorded in IRR.
*/
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
index 781fda5..0951e1e 100644
--- a/sys/amd64/vmm/vmm_stat.c
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -146,7 +146,9 @@ VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening");
VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening");
VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted");
VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted");
-VMM_STAT(VMEXIT_EPT_FAULT, "vm exits due to nested page fault");
+VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault");
+VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation");
VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
+VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
index bc58113..0190a63 100644
--- a/sys/amd64/vmm/vmm_stat.h
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -116,8 +116,10 @@ VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW);
VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW);
VMM_STAT_DECLARE(VMEXIT_INOUT);
VMM_STAT_DECLARE(VMEXIT_CPUID);
-VMM_STAT_DECLARE(VMEXIT_EPT_FAULT);
+VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT);
+VMM_STAT_DECLARE(VMEXIT_INST_EMUL);
VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
VMM_STAT_DECLARE(VMEXIT_USERSPACE);
+VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
#endif