path: root/sys/amd64/vmm
author	neel <neel@FreeBSD.org>	2014-09-02 04:22:42 +0000
committer	neel <neel@FreeBSD.org>	2014-09-02 04:22:42 +0000
commit	e5d2f8730c92c4abb6de986ec4e1f39a242b9868 (patch)
tree	6c0bf6acdf7208ac100aa28946d55a2211a18393 /sys/amd64/vmm
parent	37c34582e94ded9dca4876a77b195ca18a361fa9 (diff)
parent	53369652fd9c456d33338f450a0c02cfdc1db689 (diff)
IFC @r269962
Submitted by: Anish Gupta (akgupt3@gmail.com)
Diffstat (limited to 'sys/amd64/vmm')
-rw-r--r--  sys/amd64/vmm/amd/svm.c               182
-rw-r--r--  sys/amd64/vmm/amd/vmcb.h                4
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c              8
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h              3
-rw-r--r--  sys/amd64/vmm/intel/vmx.c             652
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.c           1
-rw-r--r--  sys/amd64/vmm/intel/vmx_msr.h          23
-rw-r--r--  sys/amd64/vmm/intel/vtd.c               5
-rw-r--r--  sys/amd64/vmm/io/vlapic.c              49
-rw-r--r--  sys/amd64/vmm/io/vlapic.h               3
-rw-r--r--  sys/amd64/vmm/vmm.c                   693
-rw-r--r--  sys/amd64/vmm/vmm_dev.c                45
-rw-r--r--  sys/amd64/vmm/vmm_host.c                7
-rw-r--r--  sys/amd64/vmm/vmm_instruction_emul.c  681
-rw-r--r--  sys/amd64/vmm/vmm_stat.c               29
-rw-r--r--  sys/amd64/vmm/vmm_stat.h               44
-rw-r--r--  sys/amd64/vmm/x86.c                    20
17 files changed, 1878 insertions, 571 deletions
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 9dd893a..fad2a80 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -117,7 +117,8 @@ static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
*/
static struct svm_regctx host_ctx[MAXCPU];
-static VMM_STAT_AMD(VCPU_EXITINTINFO, "Valid EXITINTINFO");
+static VMM_STAT_AMD(VCPU_EXITINTINFO, "Valid VMCB EXITINTINFO");
+static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "VMM pending exception injected");
/*
* Common function to enable or disable SVM for a CPU.
@@ -486,13 +487,28 @@ svm_cpl(struct vmcb_state *state)
}
static enum vm_cpu_mode
-svm_vcpu_mode(uint64_t efer)
+svm_vcpu_mode(struct vmcb *vmcb)
{
+ struct vmcb_segment *seg;
+ struct vmcb_state *state;
- if (efer & EFER_LMA)
- return (CPU_MODE_64BIT);
- else
- return (CPU_MODE_COMPATIBILITY);
+ state = &vmcb->state;
+
+ if (state->efer & EFER_LMA) {
+ seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
+ /*
+ * Section 4.8.1 of APM2: check if the Code Segment has the
+ * Long attribute set in its descriptor.
+ */
+ if (seg->attrib & VMCB_CS_ATTRIB_L)
+ return (CPU_MODE_64BIT);
+ else
+ return (CPU_MODE_COMPATIBILITY);
+ } else if (state->cr0 & CR0_PE) {
+ return (CPU_MODE_PROTECTED);
+ } else {
+ return (CPU_MODE_REAL);
+ }
}
static enum vm_paging_mode
@@ -569,16 +585,19 @@ svm_inout_str_addrsize(uint64_t info1)
}
static void
-svm_paging_info(struct vmcb_state *state, struct vm_guest_paging *paging)
+svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
+ struct vmcb_state *state;
+ state = &vmcb->state;
paging->cr3 = state->cr3;
paging->cpl = svm_cpl(state);
- paging->cpu_mode = svm_vcpu_mode(state->efer);
+ paging->cpu_mode = svm_vcpu_mode(vmcb);
paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
- state->efer);
+ state->efer);
}
+
/*
* Handle guest I/O intercept.
*/
@@ -607,7 +626,7 @@ svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
if (vmexit->u.inout.string) {
vmexit->exitcode = VM_EXITCODE_INOUT_STR;
vis = &vmexit->u.inout_str;
- svm_paging_info(state, &vis->paging);
+ svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
vis->rflags = state->rflags;
vis->cr0 = state->cr0;
vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
@@ -649,6 +668,41 @@ svm_npf_emul_fault(uint64_t exitinfo1)
return (true);
}
+static void
+svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
+{
+ struct vm_guest_paging *paging;
+ struct vmcb_segment *seg;
+
+ paging = &vmexit->u.inst_emul.paging;
+ vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+ vmexit->u.inst_emul.gpa = gpa;
+ vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
+ svm_paging_info(vmcb, paging);
+
+ /*
+ * If the DecodeAssist SVM feature doesn't exist, we don't have the NPF
+ * instruction length. The new RIP will be calculated based on the length
+ * determined by instruction emulation.
+ */
+ vmexit->inst_length = VIE_INST_SIZE;
+
+ seg = vmcb_seg(vmcb, VM_REG_GUEST_CS);
+ switch(paging->cpu_mode) {
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ /*
+ * Section 4.8.1 of APM2, Default Operand Size or D bit.
+ */
+ vmexit->u.inst_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
+ 1 : 0;
+ break;
+ default:
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ }
+}
+
/*
* Special handling of EFER MSR.
* SVM guest must have SVM EFER bit set, prohibit guest from clearing SVM
@@ -672,6 +726,29 @@ svm_efer(struct svm_softc *svm_sc, int vcpu, boolean_t write)
}
}
+static void
+svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
+{
+ struct vmcb_ctrl *ctrl;
+ uint64_t intinfo;
+
+ ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
+ intinfo = ctrl->exitintinfo;
+ if (!VMCB_EXITINTINFO_VALID(intinfo))
+ return;
+
+ /*
+ * From APMv2, Section "Intercepts during IDT interrupt delivery"
+ *
+ * If a #VMEXIT happened during event delivery then record the event
+ * that was being delivered.
+ */
+ VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
+ intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
+ vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
+ vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
+}
+
/*
* Determine the cause of virtual cpu exit and handle VMEXIT.
* Return: false - Break vcpu execution loop and handle vmexit
@@ -705,6 +782,8 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
"injection valid bit is set %#lx", __func__, ctrl->eventinj));
+ svm_save_intinfo(svm_sc, vcpu);
+
switch (code) {
case VMCB_EXIT_MC: /* Machine Check. */
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MTRAP, 1);
@@ -782,7 +861,6 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
case VMCB_EXIT_IO:
loop = svm_handle_io(svm_sc, vcpu, vmexit);
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
- update_rip = true;
break;
case VMCB_EXIT_CPUID:
@@ -847,24 +925,8 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
VCPU_CTR3(svm_sc->vm, vcpu, "SVM:NPF inst_emul,"
"RIP:0x%lx INFO1:0x%lx INFO2:0x%lx .\n",
state->rip, info1, info2);
- vmexit->exitcode = VM_EXITCODE_INST_EMUL;
- vmexit->u.inst_emul.gpa = info2;
- vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
- vmexit->u.inst_emul.paging.cr3 = state->cr3;
- vmexit->u.inst_emul.paging.cpu_mode =
- svm_vcpu_mode(state->efer);
- vmexit->u.inst_emul.paging.paging_mode =
- svm_paging_mode(state->cr0, state->cr4,
- state->efer);
- /* XXX: get CPL from SS */
- vmexit->u.inst_emul.paging.cpl = 0;
- /*
- * If DecodeAssist SVM feature doesn't exist,
- * we don't have faulty instuction length. New
- * RIP will be calculated based on software
- * instruction emulation.
- */
- vmexit->inst_length = VIE_INST_SIZE;
+ svm_handle_inst_emul(svm_get_vmcb(svm_sc, vcpu),
+ info2, vmexit);
vmm_stat_incr(svm_sc->vm, vcpu,
VMEXIT_INST_EMUL, 1);
}
@@ -944,6 +1006,28 @@ svm_inject_nmi(struct svm_softc *svm_sc, int vcpu)
return (1);
}
+static void
+svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
+{
+ struct vmcb_ctrl *ctrl;
+ uint64_t intinfo;
+
+ ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
+
+ if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
+ return;
+
+ KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
+ "valid: %#lx", __func__, intinfo));
+
+ vmcb_eventinject(ctrl, VMCB_EXITINTINFO_TYPE(intinfo),
+ VMCB_EXITINTINFO_VECTOR(intinfo),
+ VMCB_EXITINTINFO_EC(intinfo),
+ VMCB_EXITINTINFO_EC_VALID(intinfo));
+ vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
+ VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
+}
+
/*
* Inject event to virtual cpu.
*/
@@ -952,7 +1036,6 @@ svm_inj_interrupts(struct svm_softc *svm_sc, int vcpu, struct vlapic *vlapic)
{
struct vmcb_ctrl *ctrl;
struct vmcb_state *state;
- struct vm_exception exc;
int extint_pending;
int vector;
@@ -961,12 +1044,7 @@ svm_inj_interrupts(struct svm_softc *svm_sc, int vcpu, struct vlapic *vlapic)
state = svm_get_vmcb_state(svm_sc, vcpu);
ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
- if (vm_exception_pending(svm_sc->vm, vcpu, &exc)) {
- KASSERT(exc.vector >= 0 && exc.vector < 32,
- ("Exception vector% invalid", exc.vector));
- vmcb_eventinject(ctrl, VMCB_EVENTINJ_TYPE_EXCEPTION, exc.vector,
- exc.error_code, exc.error_code_valid);
- }
+ svm_inj_intinfo(svm_sc, vcpu);
/* Can't inject multiple events at once. */
if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
@@ -1044,31 +1122,6 @@ setup_tss_type(void)
desc->sd_type = 9;
}
-static void
-svm_handle_exitintinfo(struct svm_softc *svm_sc, int vcpu)
-{
- struct vmcb_ctrl *ctrl;
- uint64_t intinfo;
-
- ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
-
- /*
- * VMEXIT while delivering an exception or interrupt.
- * Inject it as virtual interrupt.
- * Section 15.7.2 Intercepts during IDT interrupt delivery.
- */
- intinfo = ctrl->exitintinfo;
-
- if (VMCB_EXITINTINFO_VALID(intinfo)) {
- vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
- VCPU_CTR1(svm_sc->vm, vcpu, "SVM:EXITINTINFO:0x%lx is valid\n",
- intinfo);
- vmcb_eventinject(ctrl, VMCB_EXITINTINFO_TYPE(intinfo),
- VMCB_EXITINTINFO_VECTOR(intinfo),
- VMCB_EXITINTINFO_EC(intinfo),
- VMCB_EXITINTINFO_EC_VALID(intinfo));
- }
-}
/*
* Start vcpu with specified RIP.
*/
@@ -1106,8 +1159,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
/*
- * Flush all TLB mapping for this guest on this CPU,
- * it might have stale entries.
+ * Flush all TLB mappings for this guest on this CPU; it
+ * might have stale entries because the vcpu has migrated
+ * or the vmm has been restarted.
*/
ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
@@ -1185,8 +1239,6 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
(void)svm_set_vmcb(svm_get_vmcb(svm_sc, vcpu), svm_sc->asid);
- svm_handle_exitintinfo(svm_sc, vcpu);
-
svm_inj_interrupts(svm_sc, vcpu, vlapic);
/* Change TSS type to available.*/
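
The svm.c changes above replace the old svm_handle_exitintinfo() path with a capture at #VMEXIT (svm_save_intinfo) and a re-injection at the next VM-entry (svm_inj_intinfo), routed through the new vm_exit_intinfo()/vm_entry_intinfo() interface. A minimal sketch of the 64-bit EXITINTINFO/EVENTINJ layout those functions pass around, per AMD APM vol. 2, follows; the struct and helper names here are hypothetical and not part of this change, the driver itself relies on the VMCB_EXITINTINFO_* macros from vmcb.h.

/*
 * Sketch of the EXITINTINFO/EVENTINJ word consumed by svm_save_intinfo()
 * and produced by svm_inj_intinfo(); hypothetical names, not vmm code.
 */
struct svm_intinfo {
	uint8_t		vector;		/* bits 7:0 */
	uint8_t		type;		/* bits 10:8: 0=intr, 2=NMI, 3=exception, 4=soft intr */
	int		ec_valid;	/* bit 11 */
	int		valid;		/* bit 31 */
	uint32_t	errcode;	/* bits 63:32 */
};

static void
svm_intinfo_decode(uint64_t intinfo, struct svm_intinfo *ii)
{
	ii->vector = intinfo & 0xff;
	ii->type = (intinfo >> 8) & 0x7;
	ii->ec_valid = (intinfo >> 11) & 0x1;
	ii->valid = (intinfo >> 31) & 0x1;
	ii->errcode = intinfo >> 32;
}
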
diff --git a/sys/amd64/vmm/amd/vmcb.h b/sys/amd64/vmm/amd/vmcb.h
index 1b7c2be..3e10469 100644
--- a/sys/amd64/vmm/amd/vmcb.h
+++ b/sys/amd64/vmm/amd/vmcb.h
@@ -163,6 +163,10 @@ struct vmcb_segment {
} __attribute__ ((__packed__));
CTASSERT(sizeof(struct vmcb_segment) == 16);
+/* Code segment descriptor attribute in 12-bit format as saved by the VMCB. */
+#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */
+#define VMCB_CS_ATTRIB_D BIT(10) /* Operand size bit. */
+
/*
* The VMCB is divided into two areas - the first one contains various
* control bits including the intercept vector and the second one contains
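
The new VMCB_CS_ATTRIB_L/VMCB_CS_ATTRIB_D definitions index into the packed 12-bit attribute format the VMCB uses for segment registers: descriptor bits 47:40 occupy attribute bits 7:0, and descriptor bits 55:52 (AVL, L, D/B, G) are folded down into bits 11:8. A hypothetical converter from a conventional access-rights word, given only to illustrate the mapping (not part of this change):

static uint16_t
access_to_vmcb_attrib(uint32_t access)
{
	/* Bits 7:0 are carried over; AVL/L/D/G move from bits 15:12 to 11:8. */
	return ((access & 0xff) | ((access & 0xf000) >> 4));
}
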
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index cc97d95..51e5c2c 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)
return (VMCS_GUEST_LDTR_SELECTOR);
case VM_REG_GUEST_EFER:
return (VMCS_GUEST_IA32_EFER);
+ case VM_REG_GUEST_PDPTE0:
+ return (VMCS_GUEST_PDPTE0);
+ case VM_REG_GUEST_PDPTE1:
+ return (VMCS_GUEST_PDPTE1);
+ case VM_REG_GUEST_PDPTE2:
+ return (VMCS_GUEST_PDPTE2);
+ case VM_REG_GUEST_PDPTE3:
+ return (VMCS_GUEST_PDPTE3);
default:
return (-1);
}
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 657d5b0..4e9557c 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t val)
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_T_SWINTR (4 << 8)
+#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define VMCS_INTR_T_SWEXCEPTION (6 << 8)
#define VMCS_INTR_DEL_ERRCODE (1 << 11)
/*
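
The added interruption types occupy bits 10:8 of the VM-entry interruption-information field (vector in bits 7:0, deliver-error-code in bit 11, valid in bit 31). vmx.c below uses VMCS_INTR_T_SWEXCEPTION to satisfy the VT-x requirement that #BP and #OF be injected as software exceptions; a small sketch of composing such a word, not code from this change:

static void
inject_bp_sketch(void)
{
	uint32_t info;

	info = IDT_BP;				/* bits 7:0: vector 3 */
	info |= VMCS_INTR_T_SWEXCEPTION;	/* bits 10:8: type 6 */
	info |= VMCS_INTR_VALID;		/* bit 31: valid */
	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
	vmcs_write(VMCS_ENTRY_INST_LENGTH, 1);	/* INT3 is one byte */
}
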
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index dd1cb69..348cc2a 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -83,7 +83,9 @@ __FBSDID("$FreeBSD$");
(PROCBASED_SECONDARY_CONTROLS | \
PROCBASED_IO_EXITING | \
PROCBASED_MSR_BITMAPS | \
- PROCBASED_CTLS_WINDOW_SETTING)
+ PROCBASED_CTLS_WINDOW_SETTING | \
+ PROCBASED_CR8_LOAD_EXITING | \
+ PROCBASED_CR8_STORE_EXITING)
#define PROCBASED_CTLS_ZERO_SETTING \
(PROCBASED_CR3_LOAD_EXITING | \
PROCBASED_CR3_STORE_EXITING | \
@@ -147,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
&cr4_zeros_mask, 0, NULL);
-static int vmx_no_patmsr;
-
static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
&vmx_initialized, 0, "Intel VMX initialized");
@@ -156,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
/*
* Optional capabilities
*/
+static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
+
+static int vmx_patmsr;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
+ "PAT MSR saved and restored in VCMS");
+
static int cap_halt_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
+ "HLT triggers a VM-exit");
+
static int cap_pause_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
+ 0, "PAUSE triggers a VM-exit");
+
static int cap_unrestricted_guest;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
+ &cap_unrestricted_guest, 0, "Unrestricted guests");
+
static int cap_monitor_trap;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
+ &cap_monitor_trap, 0, "Monitor trap flag");
+
static int cap_invpcid;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
+ 0, "Guests are allowed to use INVPCID");
static int virtual_interrupt_delivery;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
static int posted_interrupts;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
&posted_interrupts, 0, "APICv posted interrupt support");
static int pirvec;
@@ -512,6 +532,15 @@ static void
vmx_enable(void *arg __unused)
{
int error;
+ uint64_t feature_control;
+
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
+ (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
+ wrmsr(MSR_IA32_FEATURE_CONTROL,
+ feature_control | IA32_FEATURE_CONTROL_VMX_EN |
+ IA32_FEATURE_CONTROL_LOCK);
+ }
load_cr4(rcr4() | CR4_VMXE);
@@ -547,7 +576,7 @@ vmx_init(int ipinum)
* are set (bits 0 and 2 respectively).
*/
feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
- if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
+ if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
(feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
printf("vmx_init: VMX operation disabled by BIOS\n");
return (ENXIO);
@@ -607,6 +636,7 @@ vmx_init(int ipinum)
}
/* Check support for VM-exit controls */
+ vmx_patmsr = 1;
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
VM_EXIT_CTLS_ONE_SETTING,
VM_EXIT_CTLS_ZERO_SETTING,
@@ -626,12 +656,12 @@ vmx_init(int ipinum)
if (bootverbose)
printf("vmm: PAT MSR access not supported\n");
guest_msr_valid(MSR_PAT);
- vmx_no_patmsr = 1;
+ vmx_patmsr = 0;
}
}
/* Check support for VM-entry controls */
- if (!vmx_no_patmsr) {
+ if (vmx_patmsr) {
error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
MSR_VMX_TRUE_ENTRY_CTLS,
VM_ENTRY_CTLS_ONE_SETTING,
@@ -705,6 +735,13 @@ vmx_init(int ipinum)
procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
/*
+ * No need to emulate accesses to %CR8 if virtual
+ * interrupt delivery is enabled.
+ */
+ procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
+ procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
+
+ /*
* Check for Posted Interrupts only if Virtual Interrupt
* Delivery is enabled.
*/
@@ -900,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
* MSR_PAT save/restore support, leave access disabled so accesses
* will be trapped.
*/
- if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+ if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
panic("vmx_vminit: error setting guest pat msr access");
vpid_alloc(vpid, VM_MAXCPU);
@@ -956,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
- vmx->state[i].lastcpu = -1;
+ vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
@@ -1029,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
}
static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
+static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
-static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+/*
+ * Invalidate guest mappings identified by its vpid from the TLB.
+ */
+static __inline void
+vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
struct vmxstate *vmxstate;
struct invvpid_desc invvpid_desc;
vmxstate = &vmx->state[vcpu];
- if (vmxstate->lastcpu == curcpu)
+ if (vmxstate->vpid == 0)
return;
- vmxstate->lastcpu = curcpu;
-
- vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+ if (!running) {
+ /*
+ * Set the 'lastcpu' to an invalid host cpu.
+ *
+ * This will invalidate TLB entries tagged with the vcpu's
+ * vpid the next time it runs via vmx_set_pcpu_defaults().
+ */
+ vmxstate->lastcpu = NOCPU;
+ return;
+ }
- vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
- vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
- vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
+ "critical section", __func__, vcpu));
/*
- * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ * Invalidate all mappings tagged with 'vpid'
*
* We do this because this vcpu was executing on a different host
* cpu when it last ran. We do not track whether it invalidated
@@ -1063,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
* Note also that this will invalidate mappings tagged with 'vpid'
* for "all" EP4TAs.
*/
- if (vmxstate->vpid != 0) {
- if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
- invvpid_desc._res1 = 0;
- invvpid_desc._res2 = 0;
- invvpid_desc.vpid = vmxstate->vpid;
- invvpid_desc.linear_addr = 0;
- invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
- } else {
- /*
- * The invvpid can be skipped if an invept is going to
- * be performed before entering the guest. The invept
- * will invalidate combined mappings tagged with
- * 'vmx->eptp' for all vpids.
- */
- vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
- }
+ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+ invvpid_desc._res1 = 0;
+ invvpid_desc._res2 = 0;
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid_desc.linear_addr = 0;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
+ } else {
+ /*
+ * The invvpid can be skipped if an invept is going to
+ * be performed before entering the guest. The invept
+ * will invalidate combined mappings tagged with
+ * 'vmx->eptp' for all vpids.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
}
}
+static void
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+{
+ struct vmxstate *vmxstate;
+
+ vmxstate = &vmx->state[vcpu];
+ if (vmxstate->lastcpu == curcpu)
+ return;
+
+ vmxstate->lastcpu = curcpu;
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+ vmx_invvpid(vmx, vcpu, pmap, 1);
+}
+
/*
* We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
*/
@@ -1165,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu)
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
{
- struct vm_exception exc;
int vector, need_nmi_exiting, extint_pending;
- uint64_t rflags;
+ uint64_t rflags, entryinfo;
uint32_t gi, info;
- if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
- KASSERT(exc.vector >= 0 && exc.vector < 32,
- ("%s: invalid exception vector %d", __func__, exc.vector));
+ if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+ "intinfo is not valid: %#lx", __func__, entryinfo));
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
- "pending exception %d: %#x", __func__, exc.vector, info));
+ "pending exception: %#lx/%#x", __func__, entryinfo, info));
- info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
- if (exc.error_code_valid) {
- info |= VMCS_INTR_DEL_ERRCODE;
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+ info = entryinfo;
+ vector = info & 0xff;
+ if (vector == IDT_BP || vector == IDT_OF) {
+ /*
+ * VT-x requires #BP and #OF to be injected as software
+ * exceptions.
+ */
+ info &= ~VMCS_INTR_T_MASK;
+ info |= VMCS_INTR_T_SWEXCEPTION;
}
+
+ if (info & VMCS_INTR_DEL_ERRCODE)
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
}
@@ -1240,12 +1313,28 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
/* Ask the local apic for a vector to inject */
if (!vlapic_pending_intr(vlapic, &vector))
return;
+
+ /*
+ * From the Intel SDM, Volume 3, Section "Maskable
+ * Hardware Interrupts":
+ * - maskable interrupt vectors [16,255] can be delivered
+ * through the local APIC.
+ */
+ KASSERT(vector >= 16 && vector <= 255,
+ ("invalid vector %d from local APIC", vector));
} else {
/* Ask the legacy pic for a vector to inject */
vatpic_pending_intr(vmx->vm, &vector);
- }
- KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
+ /*
+ * From the Intel SDM, Volume 3, Section "Maskable
+ * Hardware Interrupts":
+ * - maskable interrupt vectors [0,255] can be delivered
+ * through the INTR pin.
+ */
+ KASSERT(vector >= 0 && vector <= 255,
+ ("invalid vector %d from INTR", vector));
+ }
/* Check RFLAGS.IF and the interruptibility state of the guest */
rflags = vmcs_read(VMCS_GUEST_RFLAGS);
@@ -1293,9 +1382,13 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
* have posted another one. If that is the case, set
* the Interrupt Window Exiting execution control so
* we can inject that one too.
+ *
+ * Also, interrupt window exiting allows us to inject any
+ * pending APIC vector that was preempted by the ExtINT
+ * as soon as possible. This applies to both the software-
+ * emulated vlapic and the hardware-assisted virtual APIC.
*/
- if (vm_extint_pending(vmx->vm, vcpu))
- vmx_set_int_window_exiting(vmx, vcpu);
+ vmx_set_int_window_exiting(vmx, vcpu);
}
VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
@@ -1341,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
+static void
+vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+ uint32_t gi;
+
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+ ("NMI blocking is not in effect %#x", gi));
+}
+
static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1380,8 +1483,30 @@ vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
return (HANDLED);
}
- if ((xcrval & (XFEATURE_ENABLED_AVX | XFEATURE_ENABLED_SSE)) ==
- XFEATURE_ENABLED_AVX) {
+ /* AVX (YMM_Hi128) requires SSE. */
+ if (xcrval & XFEATURE_ENABLED_AVX &&
+ (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
+ * ZMM_Hi256, and Hi16_ZMM.
+ */
+ if (xcrval & XFEATURE_AVX512 &&
+ (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
+ (XFEATURE_AVX512 | XFEATURE_AVX)) {
+ vm_inject_gp(vmx->vm, vcpu);
+ return (HANDLED);
+ }
+
+ /*
+ * Intel MPX requires both bound register state flags to be
+ * set.
+ */
+ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
+ ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
@@ -1395,97 +1520,130 @@ vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
return (HANDLED);
}
-static int
-vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+static uint64_t
+vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
{
- int cr, vmcs_guest_cr, vmcs_shadow_cr;
- uint64_t crval, regval, ones_mask, zeros_mask;
const struct vmxctx *vmxctx;
- /* We only handle mov to %cr0 or %cr4 at this time */
- if ((exitqual & 0xf0) != 0x00)
- return (UNHANDLED);
+ vmxctx = &vmx->ctx[vcpu];
- cr = exitqual & 0xf;
- if (cr != 0 && cr != 4)
- return (UNHANDLED);
+ switch (ident) {
+ case 0:
+ return (vmxctx->guest_rax);
+ case 1:
+ return (vmxctx->guest_rcx);
+ case 2:
+ return (vmxctx->guest_rdx);
+ case 3:
+ return (vmxctx->guest_rbx);
+ case 4:
+ return (vmcs_read(VMCS_GUEST_RSP));
+ case 5:
+ return (vmxctx->guest_rbp);
+ case 6:
+ return (vmxctx->guest_rsi);
+ case 7:
+ return (vmxctx->guest_rdi);
+ case 8:
+ return (vmxctx->guest_r8);
+ case 9:
+ return (vmxctx->guest_r9);
+ case 10:
+ return (vmxctx->guest_r10);
+ case 11:
+ return (vmxctx->guest_r11);
+ case 12:
+ return (vmxctx->guest_r12);
+ case 13:
+ return (vmxctx->guest_r13);
+ case 14:
+ return (vmxctx->guest_r14);
+ case 15:
+ return (vmxctx->guest_r15);
+ default:
+ panic("invalid vmx register %d", ident);
+ }
+}
+
+static void
+vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
+{
+ struct vmxctx *vmxctx;
- regval = 0; /* silence gcc */
vmxctx = &vmx->ctx[vcpu];
- /*
- * We must use vmcs_write() directly here because vmcs_setreg() will
- * call vmclear(vmcs) as a side-effect which we certainly don't want.
- */
- switch ((exitqual >> 8) & 0xf) {
+ switch (ident) {
case 0:
- regval = vmxctx->guest_rax;
+ vmxctx->guest_rax = regval;
break;
case 1:
- regval = vmxctx->guest_rcx;
+ vmxctx->guest_rcx = regval;
break;
case 2:
- regval = vmxctx->guest_rdx;
+ vmxctx->guest_rdx = regval;
break;
case 3:
- regval = vmxctx->guest_rbx;
+ vmxctx->guest_rbx = regval;
break;
case 4:
- regval = vmcs_read(VMCS_GUEST_RSP);
+ vmcs_write(VMCS_GUEST_RSP, regval);
break;
case 5:
- regval = vmxctx->guest_rbp;
+ vmxctx->guest_rbp = regval;
break;
case 6:
- regval = vmxctx->guest_rsi;
+ vmxctx->guest_rsi = regval;
break;
case 7:
- regval = vmxctx->guest_rdi;
+ vmxctx->guest_rdi = regval;
break;
case 8:
- regval = vmxctx->guest_r8;
+ vmxctx->guest_r8 = regval;
break;
case 9:
- regval = vmxctx->guest_r9;
+ vmxctx->guest_r9 = regval;
break;
case 10:
- regval = vmxctx->guest_r10;
+ vmxctx->guest_r10 = regval;
break;
case 11:
- regval = vmxctx->guest_r11;
+ vmxctx->guest_r11 = regval;
break;
case 12:
- regval = vmxctx->guest_r12;
+ vmxctx->guest_r12 = regval;
break;
case 13:
- regval = vmxctx->guest_r13;
+ vmxctx->guest_r13 = regval;
break;
case 14:
- regval = vmxctx->guest_r14;
+ vmxctx->guest_r14 = regval;
break;
case 15:
- regval = vmxctx->guest_r15;
+ vmxctx->guest_r15 = regval;
break;
+ default:
+ panic("invalid vmx register %d", ident);
}
+}
- if (cr == 0) {
- ones_mask = cr0_ones_mask;
- zeros_mask = cr0_zeros_mask;
- vmcs_guest_cr = VMCS_GUEST_CR0;
- vmcs_shadow_cr = VMCS_CR0_SHADOW;
- } else {
- ones_mask = cr4_ones_mask;
- zeros_mask = cr4_zeros_mask;
- vmcs_guest_cr = VMCS_GUEST_CR4;
- vmcs_shadow_cr = VMCS_CR4_SHADOW;
- }
- vmcs_write(vmcs_shadow_cr, regval);
+static int
+vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ uint64_t crval, regval;
+
+ /* We only handle mov to %cr0 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
- crval = regval | ones_mask;
- crval &= ~zeros_mask;
- vmcs_write(vmcs_guest_cr, crval);
+ vmcs_write(VMCS_CR0_SHADOW, regval);
- if (cr == 0 && regval & CR0_PG) {
+ crval = regval | cr0_ones_mask;
+ crval &= ~cr0_zeros_mask;
+ vmcs_write(VMCS_GUEST_CR0, crval);
+
+ if (regval & CR0_PG) {
uint64_t efer, entry_ctls;
/*
@@ -1506,6 +1664,51 @@ vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
return (HANDLED);
}
+static int
+vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ uint64_t crval, regval;
+
+ /* We only handle mov to %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
+
+ vmcs_write(VMCS_CR4_SHADOW, regval);
+
+ crval = regval | cr4_ones_mask;
+ crval &= ~cr4_zeros_mask;
+ vmcs_write(VMCS_GUEST_CR4, crval);
+
+ return (HANDLED);
+}
+
+static int
+vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ struct vlapic *vlapic;
+ uint64_t cr8;
+ int regnum;
+
+ /* We only handle mov %cr8 to/from a register at this time. */
+ if ((exitqual & 0xe0) != 0x00) {
+ return (UNHANDLED);
+ }
+
+ vlapic = vm_lapic(vmx->vm, vcpu);
+ regnum = (exitqual >> 8) & 0xf;
+ if (exitqual & 0x10) {
+ cr8 = vlapic_get_cr8(vlapic);
+ vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
+ } else {
+ cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
+ vlapic_set_cr8(vlapic, cr8);
+ }
+
+ return (HANDLED);
+}
+
/*
* From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
*/
@@ -1521,11 +1724,19 @@ vmx_cpl(void)
static enum vm_cpu_mode
vmx_cpu_mode(void)
{
+ uint32_t csar;
- if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
- return (CPU_MODE_64BIT);
- else
- return (CPU_MODE_COMPATIBILITY);
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ if (csar & 0x2000)
+ return (CPU_MODE_64BIT); /* CS.L = 1 */
+ else
+ return (CPU_MODE_COMPATIBILITY);
+ } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
+ return (CPU_MODE_PROTECTED);
+ } else {
+ return (CPU_MODE_REAL);
+ }
}
static enum vm_paging_mode
@@ -1617,10 +1828,25 @@ vmx_paging_info(struct vm_guest_paging *paging)
static void
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
+ struct vm_guest_paging *paging;
+ uint32_t csar;
+
+ paging = &vmexit->u.inst_emul.paging;
+
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = gla;
- vmx_paging_info(&vmexit->u.inst_emul.paging);
+ vmx_paging_info(paging);
+ switch (paging->cpu_mode) {
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
+ break;
+ default:
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ }
}
static int
@@ -1829,6 +2055,26 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
return (UNHANDLED);
}
+static enum task_switch_reason
+vmx_task_switch_reason(uint64_t qual)
+{
+ int reason;
+
+ reason = (qual >> 30) & 0x3;
+ switch (reason) {
+ case 0:
+ return (TSR_CALL);
+ case 1:
+ return (TSR_IRET);
+ case 2:
+ return (TSR_JMP);
+ case 3:
+ return (TSR_IDT_GATE);
+ default:
+ panic("%s: invalid reason %d", __func__, reason);
+ }
+}
+
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
@@ -1836,9 +2082,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
+ struct vm_task_switch *ts;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
- uint32_t reason;
- uint64_t qual, gpa;
+ uint32_t intr_type, reason;
+ uint64_t exitintinfo, qual, gpa;
bool retu;
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
@@ -1854,49 +2101,112 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
/*
- * VM exits that could be triggered during event injection on the
- * previous VM entry need to be handled specially by re-injecting
- * the event.
+ * VM exits that can be triggered during event delivery need to
+ * be handled specially by re-injecting the event if the IDT
+ * vectoring information field's valid bit is set.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
- switch (reason) {
- case EXIT_REASON_EPT_FAULT:
- case EXIT_REASON_EPT_MISCONFIG:
- case EXIT_REASON_APIC_ACCESS:
- case EXIT_REASON_TASK_SWITCH:
- case EXIT_REASON_EXCEPTION:
- idtvec_info = vmcs_idt_vectoring_info();
- if (idtvec_info & VMCS_IDT_VEC_VALID) {
- idtvec_info &= ~(1 << 12); /* clear undefined bit */
- vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
- if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
- idtvec_err = vmcs_idt_vectoring_err();
- vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
- idtvec_err);
- }
- /*
- * If 'virtual NMIs' are being used and the VM-exit
- * happened while injecting an NMI during the previous
- * VM-entry, then clear "blocking by NMI" in the Guest
- * Interruptibility-state.
- */
- if ((idtvec_info & VMCS_INTR_T_MASK) ==
- VMCS_INTR_T_NMI) {
- vmx_clear_nmi_blocking(vmx, vcpu);
- }
+ idtvec_info = vmcs_idt_vectoring_info();
+ if (idtvec_info & VMCS_IDT_VEC_VALID) {
+ idtvec_info &= ~(1 << 12); /* clear undefined bit */
+ exitintinfo = idtvec_info;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ idtvec_err = vmcs_idt_vectoring_err();
+ exitintinfo |= (uint64_t)idtvec_err << 32;
+ }
+ error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
+ KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
+ __func__, error));
+
+ /*
+ * If 'virtual NMIs' are being used and the VM-exit
+ * happened while injecting an NMI during the previous
+ * VM-entry, then clear "blocking by NMI" in the
+ * Guest Interruptibility-State so the NMI can be
+ * reinjected on the subsequent VM-entry.
+ *
+ * However, if the NMI was being delivered through a task
+ * gate, then the new task must start execution with NMIs
+ * blocked so don't clear NMI blocking in this case.
+ */
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type == VMCS_INTR_T_NMI) {
+ if (reason != EXIT_REASON_TASK_SWITCH)
+ vmx_clear_nmi_blocking(vmx, vcpu);
+ else
+ vmx_assert_nmi_blocking(vmx, vcpu);
+ }
+
+ /*
+ * Update VM-entry instruction length if the event being
+ * delivered was a software interrupt or software exception.
+ */
+ if (intr_type == VMCS_INTR_T_SWINTR ||
+ intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
+ intr_type == VMCS_INTR_T_SWEXCEPTION) {
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
- default:
- idtvec_info = 0;
- break;
}
switch (reason) {
+ case EXIT_REASON_TASK_SWITCH:
+ ts = &vmexit->u.task_switch;
+ ts->tsssel = qual & 0xffff;
+ ts->reason = vmx_task_switch_reason(qual);
+ ts->ext = 0;
+ ts->errcode_valid = 0;
+ vmx_paging_info(&ts->paging);
+ /*
+ * If the task switch was due to a CALL, JMP, IRET, software
+ * interrupt (INT n) or software exception (INT3, INTO),
+ * then the saved %rip references the instruction that caused
+ * the task switch. The instruction length field in the VMCS
+ * is valid in this case.
+ *
+ * In all other cases (e.g., NMI, hardware exception) the
+ * saved %rip is one that would have been saved in the old TSS
+ * had the task switch completed normally so the instruction
+ * length field is not needed in this case and is explicitly
+ * set to 0.
+ */
+ if (ts->reason == TSR_IDT_GATE) {
+ KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
+ ("invalid idtvec_info %#x for IDT task switch",
+ idtvec_info));
+ intr_type = idtvec_info & VMCS_INTR_T_MASK;
+ if (intr_type != VMCS_INTR_T_SWINTR &&
+ intr_type != VMCS_INTR_T_SWEXCEPTION &&
+ intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
+ /* Task switch triggered by external event */
+ ts->ext = 1;
+ vmexit->inst_length = 0;
+ if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
+ ts->errcode_valid = 1;
+ ts->errcode = vmcs_idt_vectoring_err();
+ }
+ }
+ }
+ vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
+ VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
+ "%s errcode 0x%016lx", ts->reason, ts->tsssel,
+ ts->ext ? "external" : "internal",
+ ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
+ break;
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
- handled = vmx_emulate_cr_access(vmx, vcpu, qual);
+ switch (qual & 0xf) {
+ case 0:
+ handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
+ break;
+ case 4:
+ handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
+ break;
+ case 8:
+ handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
+ break;
+ }
break;
case EXIT_REASON_RDMSR:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
@@ -2029,6 +2339,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
* the guest.
*
* See "Resuming Guest Software after Handling an Exception".
+ * See "Information for VM Exits Due to Vectored Events".
*/
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
(intr_info & 0xff) != IDT_DF &&
@@ -2129,32 +2440,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
return (handled);
}
-static __inline int
-vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
-{
-
- vmexit->rip = vmcs_guest_rip();
- vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_BOGUS;
- vmx_astpending_trace(vmx, vcpu, vmexit->rip);
- vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
-
- return (HANDLED);
-}
-
-static __inline int
-vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
-{
-
- vmexit->rip = vmcs_guest_rip();
- vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
- vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
-
- return (UNHANDLED);
-}
-
-static __inline int
+static __inline void
vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
{
@@ -2178,8 +2464,6 @@ vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
default:
panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
}
-
- return (UNHANDLED);
}
/*
@@ -2252,6 +2536,8 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
vmcs_write(VMCS_GUEST_RIP, startrip);
vmx_set_pcpu_defaults(vmx, vcpu, pmap);
do {
+ handled = UNHANDLED;
+
/*
* Interrupts are disabled from this point on until the
* guest starts executing. This is done for the following
@@ -2271,26 +2557,33 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
* pmap_invalidate_ept().
*/
disable_intr();
+ vmx_inject_interrupts(vmx, vcpu, vlapic);
+
+ /*
+ * Check for vcpu suspension after injecting events because
+ * vmx_inject_interrupts() can suspend the vcpu due to a
+ * triple fault.
+ */
if (vcpu_suspended(suspend_cookie)) {
enable_intr();
vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
- handled = UNHANDLED;
break;
}
if (vcpu_rendezvous_pending(rendezvous_cookie)) {
enable_intr();
- handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
+ vm_exit_rendezvous(vmx->vm, vcpu, vmcs_guest_rip());
break;
}
- if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+ if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
- handled = vmx_exit_astpending(vmx, vcpu, vmexit);
+ vm_exit_astpending(vmx->vm, vcpu, vmcs_guest_rip());
+ vmx_astpending_trace(vmx, vcpu, vmexit->rip);
+ handled = HANDLED;
break;
}
- vmx_inject_interrupts(vmx, vcpu, vlapic);
vmx_run_trace(vmx, vcpu);
rc = vmx_enter_guest(vmxctx, vmx, launched);
@@ -2306,7 +2599,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
handled = vmx_exit_process(vmx, vcpu, vmexit);
} else {
enable_intr();
- handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
+ vmx_exit_inst_error(vmxctx, rc, vmexit);
}
launched = 1;
vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
@@ -2458,6 +2751,7 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
int error, hostcpu, running, shadow;
uint64_t ctls;
+ pmap_t pmap;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
@@ -2495,6 +2789,18 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
error = vmcs_setreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(shadow), val);
}
+
+ if (reg == VM_REG_GUEST_CR3) {
+ /*
+ * Invalidate the guest vcpu's TLB mappings to emulate
+ * the behavior of updating %cr3.
+ *
+ * XXX the processor retains global mappings when %cr3
+ * is updated but vmx_invvpid() does not.
+ */
+ pmap = vmx->ctx[vcpu].pmap;
+ vmx_invvpid(vmx, vcpu, pmap, running);
+ }
}
return (error);
diff --git a/sys/amd64/vmm/intel/vmx_msr.c b/sys/amd64/vmm/intel/vmx_msr.c
index 2aba63c..a3428db 100644
--- a/sys/amd64/vmm/intel/vmx_msr.c
+++ b/sys/amd64/vmm/intel/vmx_msr.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
#include "vmx_msr.h"
diff --git a/sys/amd64/vmm/intel/vmx_msr.h b/sys/amd64/vmm/intel/vmx_msr.h
index e6379a9..340b0f7 100644
--- a/sys/amd64/vmm/intel/vmx_msr.h
+++ b/sys/amd64/vmm/intel/vmx_msr.h
@@ -29,29 +29,6 @@
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
-#define MSR_VMX_BASIC 0x480
-#define MSR_VMX_EPT_VPID_CAP 0x48C
-
-#define MSR_VMX_PROCBASED_CTLS 0x482
-#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48E
-
-#define MSR_VMX_PINBASED_CTLS 0x481
-#define MSR_VMX_TRUE_PINBASED_CTLS 0x48D
-
-#define MSR_VMX_PROCBASED_CTLS2 0x48B
-
-#define MSR_VMX_EXIT_CTLS 0x483
-#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
-
-#define MSR_VMX_ENTRY_CTLS 0x484
-#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
-
-#define MSR_VMX_CR0_FIXED0 0x486
-#define MSR_VMX_CR0_FIXED1 0x487
-
-#define MSR_VMX_CR4_FIXED0 0x488
-#define MSR_VMX_CR4_FIXED1 0x489
-
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
diff --git a/sys/amd64/vmm/intel/vtd.c b/sys/amd64/vmm/intel/vtd.c
index afd7155..be57aff 100644
--- a/sys/amd64/vmm/intel/vtd.c
+++ b/sys/amd64/vmm/intel/vtd.c
@@ -448,6 +448,11 @@ vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
ptpindex = 0;
ptpshift = 0;
+ KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
+ gpa, len));
+ KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
+ "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
+
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index d93641c..3c93463 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -906,6 +906,46 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
+static void
+vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ lapic->tpr = val;
+ vlapic_update_ppr(vlapic);
+}
+
+static uint8_t
+vlapic_get_tpr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ return (lapic->tpr);
+}
+
+void
+vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
+{
+ uint8_t tpr;
+
+ if (val & ~0xf) {
+ vm_inject_gp(vlapic->vm, vlapic->vcpuid);
+ return;
+ }
+
+ tpr = val << 4;
+ vlapic_set_tpr(vlapic, tpr);
+}
+
+uint64_t
+vlapic_get_cr8(struct vlapic *vlapic)
+{
+ uint8_t tpr;
+
+ tpr = vlapic_get_tpr(vlapic);
+ return (tpr >> 4);
+}
+
int
vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
{
@@ -1004,11 +1044,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
if (vlapic2->boot_state != BS_SIPI)
return (0);
- /*
- * XXX this assumes that the startup IPI always succeeds
- */
vlapic2->boot_state = BS_RUNNING;
- vm_activate_cpu(vlapic2->vm, dest);
*retu = true;
vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
@@ -1188,7 +1224,7 @@ vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
*data = lapic->version;
break;
case APIC_OFFSET_TPR:
- *data = lapic->tpr;
+ *data = vlapic_get_tpr(vlapic);
break;
case APIC_OFFSET_APR:
*data = lapic->apr;
@@ -1309,8 +1345,7 @@ vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
vlapic_id_write_handler(vlapic);
break;
case APIC_OFFSET_TPR:
- lapic->tpr = data & 0xff;
- vlapic_update_ppr(vlapic);
+ vlapic_set_tpr(vlapic, data & 0xff);
break;
case APIC_OFFSET_EOI:
vlapic_process_eoi(vlapic);
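
The new vlapic_set_cr8()/vlapic_get_cr8() back the %cr8 emulation added to vmx.c: architecturally %cr8[3:0] mirrors TPR[7:4], and values above 0xf raise #GP. A standalone sketch of the round trip (not vmm code):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t cr8 = 0x9;
	uint8_t tpr;

	assert((cr8 & ~(uint64_t)0xf) == 0);	/* otherwise the vlapic injects #GP */
	tpr = cr8 << 4;				/* TPR becomes 0x90 */
	assert((uint64_t)(tpr >> 4) == cr8);	/* reading %cr8 back yields 0x9 */
	return (0);
}
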
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
index 3195cc6..0e68b2f 100644
--- a/sys/amd64/vmm/io/vlapic.h
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -92,6 +92,9 @@ void vlapic_reset_tmr(struct vlapic *vlapic);
void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
int delmode, int vector);
+void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val);
+uint64_t vlapic_get_cr8(struct vlapic *vlapic);
+
/* APIC write handlers */
void vlapic_id_write_handler(struct vlapic *vlapic);
void vlapic_ldr_write_handler(struct vlapic *vlapic);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index ff79182..31d420d 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -84,25 +84,32 @@ __FBSDID("$FreeBSD$");
struct vlapic;
+/*
+ * Initialization:
+ * (a) allocated when vcpu is created
+ * (i) initialized when vcpu is created and when it is reinitialized
+ * (o) initialized the first time the vcpu is created
+ * (x) initialized before use
+ */
struct vcpu {
- int flags;
- enum vcpu_state state;
- struct mtx mtx;
- int hostcpu; /* host cpuid this vcpu last ran on */
- uint64_t guest_msrs[VMM_MSR_NUM];
- struct vlapic *vlapic;
- int vcpuid;
- struct savefpu *guestfpu; /* guest fpu state */
- uint64_t guest_xcr0;
- void *stats;
- struct vm_exit exitinfo;
- enum x2apic_state x2apic_state;
- int nmi_pending;
- int extint_pending;
- struct vm_exception exception;
- int exception_pending;
+ struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
+ enum vcpu_state state; /* (o) vcpu state */
+ int hostcpu; /* (o) vcpu's host cpu */
+ struct vlapic *vlapic; /* (i) APIC device model */
+ enum x2apic_state x2apic_state; /* (i) APIC mode */
+ uint64_t exitintinfo; /* (i) events pending at VM exit */
+ int nmi_pending; /* (i) NMI pending */
+ int extint_pending; /* (i) INTR pending */
+ struct vm_exception exception; /* (x) exception collateral */
+ int exception_pending; /* (i) exception pending */
+ struct savefpu *guestfpu; /* (a,i) guest fpu state */
+ uint64_t guest_xcr0; /* (i) guest %xcr0 register */
+ void *stats; /* (a,i) statistics */
+ uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
+ struct vm_exit exitinfo; /* (x) exit reason and collateral */
};
+#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
@@ -116,36 +123,33 @@ struct mem_seg {
};
#define VM_MAX_MEMORY_SEGMENTS 2
+/*
+ * Initialization:
+ * (o) initialized the first time the VM is created
+ * (i) initialized when VM is created and when it is reinitialized
+ * (x) initialized before use
+ */
struct vm {
- void *cookie; /* processor-specific data */
- void *iommu; /* iommu-specific data */
- struct vhpet *vhpet; /* virtual HPET */
- struct vioapic *vioapic; /* virtual ioapic */
- struct vatpic *vatpic; /* virtual atpic */
- struct vatpit *vatpit; /* virtual atpit */
- struct vmspace *vmspace; /* guest's address space */
- struct vcpu vcpu[VM_MAXCPU];
- int num_mem_segs;
- struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
- char name[VM_MAX_NAMELEN];
-
- /*
- * Set of active vcpus.
- * An active vcpu is one that has been started implicitly (BSP) or
- * explicitly (AP) by sending it a startup ipi.
- */
- volatile cpuset_t active_cpus;
-
- struct mtx rendezvous_mtx;
- cpuset_t rendezvous_req_cpus;
- cpuset_t rendezvous_done_cpus;
- void *rendezvous_arg;
+ void *cookie; /* (i) cpu-specific data */
+ void *iommu; /* (x) iommu-specific data */
+ struct vhpet *vhpet; /* (i) virtual HPET */
+ struct vioapic *vioapic; /* (i) virtual ioapic */
+ struct vatpic *vatpic; /* (i) virtual atpic */
+ struct vatpit *vatpit; /* (i) virtual atpit */
+ volatile cpuset_t active_cpus; /* (i) active vcpus */
+ int suspend; /* (i) stop VM execution */
+ volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
+ volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
+ cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
+ cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
+ void *rendezvous_arg; /* (x) rendezvous func/arg */
vm_rendezvous_func_t rendezvous_func;
-
- int suspend;
- volatile cpuset_t suspended_cpus;
-
- volatile cpuset_t halted_cpus;
+ struct mtx rendezvous_mtx; /* (o) rendezvous lock */
+ int num_mem_segs; /* (o) guest memory segments */
+ struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ struct vmspace *vmspace; /* (o) guest's address space */
+ char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
+ struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
};
static int vmm_initialized;
@@ -196,7 +200,6 @@ SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
* interrupts disabled.
*/
static int halt_detection_enabled = 1;
-TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
&halt_detection_enabled, 0,
"Halt VM if all vcpus execute HLT with interrupts disabled");
@@ -206,31 +209,47 @@ SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
"IPI vector used for vcpu notifications");
static void
-vcpu_cleanup(struct vm *vm, int i)
+vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
struct vcpu *vcpu = &vm->vcpu[i];
VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
- vmm_stat_free(vcpu->stats);
- fpu_save_area_free(vcpu->guestfpu);
+ if (destroy) {
+ vmm_stat_free(vcpu->stats);
+ fpu_save_area_free(vcpu->guestfpu);
+ }
}
static void
-vcpu_init(struct vm *vm, uint32_t vcpu_id)
+vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
struct vcpu *vcpu;
-
+
+ KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
+ ("vcpu_init: invalid vcpu %d", vcpu_id));
+
vcpu = &vm->vcpu[vcpu_id];
- vcpu_lock_init(vcpu);
- vcpu->hostcpu = NOCPU;
- vcpu->vcpuid = vcpu_id;
+ if (create) {
+ KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
+ "initialized", vcpu_id));
+ vcpu_lock_init(vcpu);
+ vcpu->state = VCPU_IDLE;
+ vcpu->hostcpu = NOCPU;
+ vcpu->guestfpu = fpu_save_area_alloc();
+ vcpu->stats = vmm_stat_alloc();
+ }
+
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->exitintinfo = 0;
+ vcpu->nmi_pending = 0;
+ vcpu->extint_pending = 0;
+ vcpu->exception_pending = 0;
vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
- vcpu->guestfpu = fpu_save_area_alloc();
fpu_save_area_reset(vcpu->guestfpu);
- vcpu->stats = vmm_stat_alloc();
+ vmm_stat_init(vcpu->stats);
+ guest_msrs_init(vm, vcpu_id);
}
struct vm_exit *
@@ -335,15 +354,33 @@ static moduledata_t vmm_kmod = {
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
+static void
+vm_init(struct vm *vm, bool create)
+{
+ int i;
+
+ vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
+ vm->iommu = NULL;
+ vm->vioapic = vioapic_init(vm);
+ vm->vhpet = vhpet_init(vm);
+ vm->vatpic = vatpic_init(vm);
+ vm->vatpit = vatpit_init(vm);
+
+ CPU_ZERO(&vm->active_cpus);
+
+ vm->suspend = 0;
+ CPU_ZERO(&vm->suspended_cpus);
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_init(vm, i, create);
+}
+
int
vm_create(const char *name, struct vm **retvm)
{
- int i;
struct vm *vm;
struct vmspace *vmspace;
- const int BSP = 0;
-
/*
* If vmm.ko could not be successfully initialized then don't attempt
* to create the virtual machine.
@@ -360,20 +397,11 @@ vm_create(const char *name, struct vm **retvm)
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
+ vm->num_mem_segs = 0;
vm->vmspace = vmspace;
mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
- vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
- vm->vioapic = vioapic_init(vm);
- vm->vhpet = vhpet_init(vm);
- vm->vatpic = vatpic_init(vm);
- vm->vatpit = vatpit_init(vm);
-
- for (i = 0; i < VM_MAXCPU; i++) {
- vcpu_init(vm, i);
- guest_msrs_init(vm, i);
- }
- vm_activate_cpu(vm, BSP);
+ vm_init(vm, true);
*retvm = vm;
return (0);
@@ -389,8 +417,8 @@ vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
bzero(seg, sizeof(*seg));
}
-void
-vm_destroy(struct vm *vm)
+static void
+vm_cleanup(struct vm *vm, bool destroy)
{
int i;
@@ -404,21 +432,48 @@ vm_destroy(struct vm *vm)
vatpic_cleanup(vm->vatpic);
vioapic_cleanup(vm->vioapic);
- for (i = 0; i < vm->num_mem_segs; i++)
- vm_free_mem_seg(vm, &vm->mem_segs[i]);
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(vm, i, destroy);
- vm->num_mem_segs = 0;
+ VMCLEANUP(vm->cookie);
- for (i = 0; i < VM_MAXCPU; i++)
- vcpu_cleanup(vm, i);
+ if (destroy) {
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
- VMSPACE_FREE(vm->vmspace);
+ vm->num_mem_segs = 0;
- VMCLEANUP(vm->cookie);
+ VMSPACE_FREE(vm->vmspace);
+ vm->vmspace = NULL;
+ }
+}
+void
+vm_destroy(struct vm *vm)
+{
+ vm_cleanup(vm, true);
free(vm, M_VM);
}
+int
+vm_reinit(struct vm *vm)
+{
+ int error;
+
+ /*
+ * A virtual machine can be reset only if all vcpus are suspended.
+ */
+ if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
+ vm_cleanup(vm, false);
+ vm_init(vm, false);
+ error = 0;
+ } else {
+ error = EBUSY;
+ }
+
+ return (error);
+}
+
const char *
vm_name(struct vm *vm)
{
@@ -517,6 +572,21 @@ vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
return (0);
}
+static vm_paddr_t
+vm_maxmem(struct vm *vm)
+{
+ int i;
+ vm_paddr_t gpa, maxmem;
+
+ maxmem = 0;
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
+ if (gpa > maxmem)
+ maxmem = gpa;
+ }
+ return (maxmem);
+}
+
static void
vm_gpa_unwire(struct vm *vm)
{
@@ -654,7 +724,7 @@ vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
if (ppt_assigned_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
- maxaddr = vmm_mem_maxaddr();
+ maxaddr = vm_maxmem(vm);
vm->iommu = iommu_create_domain(maxaddr);
error = vm_gpa_wire(vm);
@@ -1050,6 +1120,10 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
}
}
+ /* Don't go to sleep if the vcpu thread needs to yield */
+ if (vcpu_should_yield(vm, vcpuid))
+ break;
+
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
@@ -1073,7 +1147,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
t = ticks;
vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
- msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
+ /*
+ * XXX msleep_spin() cannot be interrupted by signals so
+ * wake up periodically to check pending signals.
+ */
+ msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
vcpu_require_state_locked(vcpu, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
@@ -1137,15 +1215,18 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
- int error;
+ enum vm_cpu_mode cpu_mode;
+ int cs_d, error;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
+ cs_d = vme->u.inst_emul.cs_d;
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
+ cpu_mode = paging->cpu_mode;
vie_init(vie);
@@ -1159,7 +1240,7 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
else if (error != 0)
panic("%s: vmm_fetch_instruction error %d", __func__, error);
- if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
+ if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
return (EFAULT);
/*
@@ -1186,8 +1267,8 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
return (0);
}
- error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
- retu);
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
+ mread, mwrite, retu);
return (error);
}
@@ -1286,6 +1367,32 @@ vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
vmexit->u.suspended.how = vm->suspend;
}
+void
+vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
+{
+ struct vm_exit *vmexit;
+
+ KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
+
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->rip = rip;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
+}
+
+void
+vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
+{
+ struct vm_exit *vmexit;
+
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->rip = rip;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
+}
+
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
@@ -1303,6 +1410,12 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
+ if (!CPU_ISSET(vcpuid, &vm->active_cpus))
+ return (EINVAL);
+
+ if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
+ return (EINVAL);
+
rptr = &vm->rendezvous_func;
sptr = &vm->suspend;
pmap = vmspace_pmap(vm->vmspace);
@@ -1379,6 +1492,202 @@ restart:
}
int
+vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
+{
+ struct vcpu *vcpu;
+ int type, vector;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+ if (type == VM_INTINFO_NMI && vector != IDT_NMI)
+ return (EINVAL);
+ if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
+ return (EINVAL);
+ if (info & VM_INTINFO_RSVD)
+ return (EINVAL);
+ } else {
+ info = 0;
+ }
+ VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
+ vcpu->exitintinfo = info;
+ return (0);
+}
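
vm_exit_intinfo() expects the exit-time event in a packed 64-bit 'intinfo' format: the vector in bits 7:0, the event type in the VM_INTINFO_TYPE field, a deliver-error-code flag, the error code in bits 63:32 and a valid bit. A hedged sketch of packing such a value for a #GP(0); the local defines mirror what sys/amd64/include/vmm.h is assumed to provide, and only the relationships checked above are relied on:

#include <stdint.h>
#include <stdio.h>

/* Assumed encodings mirroring <machine/vmm.h>. */
#define	VM_INTINFO_VECTOR(info)	((info) & 0xff)
#define	VM_INTINFO_HWEXCEPTION	(3 << 8)
#define	VM_INTINFO_DEL_ERRCODE	(1 << 11)
#define	VM_INTINFO_VALID	(1UL << 31)
#define	IDT_GP			13

static uint64_t
make_hwexception(int vector, uint32_t errcode)
{
	uint64_t info;

	info = (uint8_t)vector;			/* bits 7:0 */
	info |= VM_INTINFO_HWEXCEPTION;		/* hardware exception type */
	info |= VM_INTINFO_DEL_ERRCODE;		/* error code is valid */
	info |= (uint64_t)errcode << 32;	/* error code in bits 63:32 */
	info |= VM_INTINFO_VALID;
	return (info);
}

int
main(void)
{
	uint64_t info = make_hwexception(IDT_GP, 0);

	/*
	 * This is the shape of value a hypervisor backend would pass to
	 * vm_exit_intinfo(vm, vcpuid, info) after an interrupted delivery.
	 */
	printf("intinfo %#jx, vector %d\n", (uintmax_t)info,
	    (int)VM_INTINFO_VECTOR(info));
	return (0);
}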
+
+enum exc_class {
+ EXC_BENIGN,
+ EXC_CONTRIBUTORY,
+ EXC_PAGEFAULT
+};
+
+#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
+
+static enum exc_class
+exception_class(uint64_t info)
+{
+ int type, vector;
+
+ KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+
+ /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ case VM_INTINFO_SWINTR:
+ case VM_INTINFO_NMI:
+ return (EXC_BENIGN);
+ default:
+ /*
+ * Hardware exception.
+ *
+ * SVM and VT-x use identical type values to represent NMI,
+ * hardware interrupt and software interrupt.
+ *
+ * SVM uses type '3' for all exceptions. VT-x uses type '3'
+ * for exceptions except #BP and #OF. #BP and #OF use a type
+ * value of '5' or '6'. Therefore we don't check for explicit
+ * values of 'type' to classify 'intinfo' into a hardware
+ * exception.
+ */
+ break;
+ }
+
+ switch (vector) {
+ case IDT_PF:
+ case IDT_VE:
+ return (EXC_PAGEFAULT);
+ case IDT_DE:
+ case IDT_TS:
+ case IDT_NP:
+ case IDT_SS:
+ case IDT_GP:
+ return (EXC_CONTRIBUTORY);
+ default:
+ return (EXC_BENIGN);
+ }
+}
+
+static int
+nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
+ uint64_t *retinfo)
+{
+ enum exc_class exc1, exc2;
+ int type1, vector1;
+
+ KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
+ KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
+
+ /*
+ * If an exception occurs while attempting to call the double-fault
+ * handler the processor enters shutdown mode (aka triple fault).
+ */
+ type1 = info1 & VM_INTINFO_TYPE;
+ vector1 = info1 & 0xff;
+ if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
+ VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
+ info1, info2);
+ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
+ *retinfo = 0;
+ return (0);
+ }
+
+ /*
+ * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
+ */
+ exc1 = exception_class(info1);
+ exc2 = exception_class(info2);
+ if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
+ (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
+ /* Convert nested fault into a double fault. */
+ *retinfo = IDT_DF;
+ *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ *retinfo |= VM_INTINFO_DEL_ERRCODE;
+ } else {
+ /* Handle exceptions serially */
+ *retinfo = info2;
+ }
+ return (1);
+}
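
Together, exception_class() and nested_fault() implement the double-fault matrix from the SDM: the event that was being delivered (info1) and the new exception (info2) either collapse into #DF with a zero error code, get delivered one after the other, or escalate to a triple fault. The outcomes, as decided by the code above:

	event being delivered (info1)    new exception (info2)     result
	#PF                              #GP (contributory)        #DF, error code 0
	#PF                              #PF                       #DF, error code 0
	#GP (contributory)               #PF                       delivered serially
	hardware interrupt (benign)      any exception             delivered serially
	#DF                              any exception             vm_suspend(VM_SUSPEND_TRIPLEFAULT)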
+
+static uint64_t
+vcpu_exception_intinfo(struct vcpu *vcpu)
+{
+ uint64_t info = 0;
+
+ if (vcpu->exception_pending) {
+ info = vcpu->exception.vector & 0xff;
+ info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ if (vcpu->exception.error_code_valid) {
+ info |= VM_INTINFO_DEL_ERRCODE;
+ info |= (uint64_t)vcpu->exception.error_code << 32;
+ }
+ }
+ return (info);
+}
+
+int
+vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
+{
+ struct vcpu *vcpu;
+ uint64_t info1, info2;
+ int valid;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ info1 = vcpu->exitintinfo;
+ vcpu->exitintinfo = 0;
+
+ info2 = 0;
+ if (vcpu->exception_pending) {
+ info2 = vcpu_exception_intinfo(vcpu);
+ vcpu->exception_pending = 0;
+ VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
+ vcpu->exception.vector, info2);
+ }
+
+ if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
+ valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
+ } else if (info1 & VM_INTINFO_VALID) {
+ *retinfo = info1;
+ valid = 1;
+ } else if (info2 & VM_INTINFO_VALID) {
+ *retinfo = info2;
+ valid = 1;
+ } else {
+ valid = 0;
+ }
+
+ if (valid) {
+ VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
+ "retinfo(%#lx)", __func__, info1, info2, *retinfo);
+ }
+
+ return (valid);
+}
+
+int
+vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ *info1 = vcpu->exitintinfo;
+ *info2 = vcpu_exception_intinfo(vcpu);
+ return (0);
+}
+
+int
vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
{
struct vcpu *vcpu;
@@ -1389,6 +1698,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
if (exception->vector < 0 || exception->vector >= 32)
return (EINVAL);
+ /*
+ * A double fault exception should never be injected directly into
+ * the guest. It is a derived exception that results from specific
+ * combinations of nested faults.
+ */
+ if (exception->vector == IDT_DF)
+ return (EINVAL);
+
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
@@ -1404,32 +1721,21 @@ vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
return (0);
}
-int
-vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
-{
- struct vcpu *vcpu;
- int pending;
-
- KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
-
- vcpu = &vm->vcpu[vcpuid];
- pending = vcpu->exception_pending;
- if (pending) {
- vcpu->exception_pending = 0;
- *exception = vcpu->exception;
- VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
- exception->vector);
- }
- return (pending);
-}
-
-static void
-vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
+void
+vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
+ int errcode)
{
+ struct vm_exception exception;
struct vm_exit *vmexit;
+ struct vm *vm;
int error;
- error = vm_inject_exception(vm, vcpuid, exception);
+ vm = vmarg;
+
+ exception.vector = vector;
+ exception.error_code = errcode;
+ exception.error_code_valid = errcode_valid;
+ error = vm_inject_exception(vm, vcpuid, &exception);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
/*
@@ -1444,45 +1750,19 @@ vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
}
void
-vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
+vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
- struct vm_exception pf = {
- .vector = IDT_PF,
- .error_code_valid = 1,
- .error_code = error_code
- };
+ struct vm *vm;
int error;
+ vm = vmarg;
VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
error_code, cr2);
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
- vm_inject_fault(vm, vcpuid, &pf);
-}
-
-void
-vm_inject_gp(struct vm *vm, int vcpuid)
-{
- struct vm_exception gpf = {
- .vector = IDT_GP,
- .error_code_valid = 1,
- .error_code = 0
- };
-
- vm_inject_fault(vm, vcpuid, &gpf);
-}
-
-void
-vm_inject_ud(struct vm *vm, int vcpuid)
-{
- struct vm_exception udf = {
- .vector = IDT_UD,
- .error_code_valid = 0
- };
-
- vm_inject_fault(vm, vcpuid, &udf);
+ vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
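
With vm_inject_fault() now taking a bare (vector, errcode_valid, errcode) tuple and an opaque vm pointer, the deleted vm_inject_gp() and vm_inject_ud() helpers reduce to one-line wrappers. A sketch of the equivalents in terms of the new interface; whether the tree provides these as inlines in a header or expects callers to open-code them is an assumption here:

/* Assumes the vm_inject_fault() prototype from <machine/vmm.h>. */
static __inline void
vm_inject_gp(void *vm, int vcpuid)
{

	/* #GP always delivers an error code; it is 0 here. */
	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
}

static __inline void
vm_inject_ud(void *vm, int vcpuid)
{

	/* #UD has no error code. */
	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
}

The vm_inject_ss() and vm_inject_ac() calls made by emulate_push() later in this change follow the same pattern.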
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
@@ -1717,17 +1997,19 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
return (state);
}
-void
+int
vm_activate_cpu(struct vm *vm, int vcpuid)
{
- KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
- ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
- KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
- ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (CPU_ISSET(vcpuid, &vm->active_cpus))
+ return (EBUSY);
VCPU_CTR0(vm, vcpuid, "activated");
CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
+ return (0);
}
cpuset_t
@@ -1737,6 +2019,13 @@ vm_active_cpus(struct vm *vm)
return (vm->active_cpus);
}
+cpuset_t
+vm_suspended_cpus(struct vm *vm)
+{
+
+ return (vm->suspended_cpus);
+}
+
void *
vcpu_stats(struct vm *vm, int vcpuid)
{
@@ -1906,3 +2195,125 @@ vm_segment_name(int seg)
("%s: invalid segment encoding %d", __func__, seg));
return (seg_names[seg]);
}
+
+void
+vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int idx;
+
+ for (idx = 0; idx < num_copyinfo; idx++) {
+ if (copyinfo[idx].cookie != NULL)
+ vm_gpa_release(copyinfo[idx].cookie);
+ }
+ bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
+}
+
+int
+vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int error, idx, nused;
+ size_t n, off, remaining;
+ void *hva, *cookie;
+ uint64_t gpa;
+
+ bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
+
+ nused = 0;
+ remaining = len;
+ while (remaining > 0) {
+ KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
+ error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+ if (error)
+ return (error);
+ off = gpa & PAGE_MASK;
+ n = min(remaining, PAGE_SIZE - off);
+ copyinfo[nused].gpa = gpa;
+ copyinfo[nused].len = n;
+ remaining -= n;
+ gla += n;
+ nused++;
+ }
+
+ for (idx = 0; idx < nused; idx++) {
+ hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
+ prot, &cookie);
+ if (hva == NULL)
+ break;
+ copyinfo[idx].hva = hva;
+ copyinfo[idx].cookie = cookie;
+ }
+
+ if (idx != nused) {
+ vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
+ return (-1);
+ } else {
+ return (0);
+ }
+}
+
+void
+vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
+ size_t len)
+{
+ char *dst;
+ int idx;
+
+ dst = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ dst += copyinfo[idx].len;
+ idx++;
+ }
+}
+
+void
+vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len)
+{
+ const char *src;
+ int idx;
+
+ src = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ src += copyinfo[idx].len;
+ idx++;
+ }
+}
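
vm_copy_setup(), vm_copyin(), vm_copyout() and vm_copy_teardown() turn a guest linear address range into a pinned set of host mappings and then move bytes across them; the rewritten vmm_fetch_instruction() and emulate_push() below are the in-tree users. A hedged sketch of the read side, following the error convention documented in emulate_push() (1 means a fault was injected and the guest should be resumed, -1 means the GPA could not be mapped); read_guest_bytes() and its caller context are hypothetical:

/*
 * Read 'len' bytes at guest linear address 'gla' into 'buf'.  'len'
 * must fit in two pages, matching the copyinfo[2] sizing used above.
 */
static int
read_guest_bytes(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct vm_copyinfo copyinfo[2];
	int error;

	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
	    copyinfo, nitems(copyinfo));
	if (error)
		return (error);		/* 1: fault injected, -1: no mapping */

	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}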
+
+/*
+ * Return the amount of in-use and wired memory for the VM. Since
+ * these are global stats, only return the values for vCPU 0.
+ */
+VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
+VMM_STAT_DECLARE(VMM_MEM_WIRED);
+
+static void
+vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
+{
+
+ if (vcpu == 0) {
+ vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
+ PAGE_SIZE * vmspace_resident_count(vm->vmspace));
+ }
+}
+
+static void
+vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
+{
+
+ if (vcpu == 0) {
+ vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
+ PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
+ }
+}
+
+VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
+VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
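
VMM_STAT_FUNC() (added to vmm_stat.h in this change) registers a statistic together with a callback, and vmm_stat_copy() now runs every such callback just before copying the counters out, so pull-style values like the two above are computed on demand instead of on every VM-exit. A sketch of wiring up one more such stat inside vmm.c; the VMM_MEM_MAXGPA name and its meaning are hypothetical:

VMM_STAT_DECLARE(VMM_MEM_MAXGPA);

static void
vm_get_maxgpa(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	/* Global per-VM value, so only report it against vcpu 0. */
	if (vcpu == 0)
		vmm_stat_set(vm, vcpu, VMM_MEM_MAXGPA, vm_maxmem(vm));
}

VMM_STAT_FUNC(VMM_MEM_MAXGPA, "Highest guest physical address", vm_get_maxgpa);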
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 0561785..a85109e 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -146,7 +146,8 @@ static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
- int error, vcpu, state_changed;
+ int error, vcpu, state_changed, size;
+ cpuset_t *cpuset;
struct vmmdev_softc *sc;
struct vm_memory_segment *seg;
struct vm_register *vmreg;
@@ -170,6 +171,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct vm_gpa_pte *gpapte;
struct vm_suspend *vmsuspend;
struct vm_gla2gpa *gg;
+ struct vm_activate_cpu *vac;
+ struct vm_cpuset *vm_cpuset;
+ struct vm_intinfo *vmii;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
@@ -195,6 +199,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_PPTDEV_MSIX:
case VM_SET_X2APIC_STATE:
case VM_GLA2GPA:
+ case VM_ACTIVATE_CPU:
+ case VM_SET_INTINFO:
+ case VM_GET_INTINFO:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
@@ -216,6 +223,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
case VM_BIND_PPTDEV:
case VM_UNBIND_PPTDEV:
case VM_MAP_MEMORY:
+ case VM_REINIT:
/*
* ioctls that operate on the entire virtual machine must
* prevent all vcpus from running.
@@ -249,6 +257,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
vmsuspend = (struct vm_suspend *)data;
error = vm_suspend(sc->vm, vmsuspend->how);
break;
+ case VM_REINIT:
+ error = vm_reinit(sc->vm);
+ break;
case VM_STAT_DESC: {
statdesc = (struct vm_stat_desc *)data;
error = vmm_stat_desc_copy(statdesc->index,
@@ -439,6 +450,38 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
}
break;
}
+ case VM_ACTIVATE_CPU:
+ vac = (struct vm_activate_cpu *)data;
+ error = vm_activate_cpu(sc->vm, vac->vcpuid);
+ break;
+ case VM_GET_CPUS:
+ error = 0;
+ vm_cpuset = (struct vm_cpuset *)data;
+ size = vm_cpuset->cpusetsize;
+ if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
+ error = ERANGE;
+ break;
+ }
+ cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
+ if (vm_cpuset->which == VM_ACTIVE_CPUS)
+ *cpuset = vm_active_cpus(sc->vm);
+ else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
+ *cpuset = vm_suspended_cpus(sc->vm);
+ else
+ error = EINVAL;
+ if (error == 0)
+ error = copyout(cpuset, vm_cpuset->cpus, size);
+ free(cpuset, M_TEMP);
+ break;
+ case VM_SET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
+ break;
+ case VM_GET_INTINFO:
+ vmii = (struct vm_intinfo *)data;
+ error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
+ &vmii->info2);
+ break;
default:
error = ENOTTY;
break;
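
The new VM_ACTIVATE_CPU, VM_GET_CPUS and VM_SET_INTINFO/VM_GET_INTINFO ioctls are thin wrappers around the vmm.c routines above. A hedged userland sketch of querying the active vCPU set via VM_GET_CPUS; the struct vm_cpuset field names follow the handler above, but the header locations, the exact struct layout and the /dev/vmm/<name> path are assumptions (bhyve itself goes through libvmmapi):

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(int argc, char *argv[])
{
	struct vm_cpuset vc;
	cpuset_t cpus;
	char path[MAXPATHLEN];
	int fd, i;

	if (argc != 2)
		errx(1, "usage: %s <vmname>", argv[0]);

	snprintf(path, sizeof(path), "/dev/vmm/%s", argv[1]);
	if ((fd = open(path, O_RDWR)) < 0)
		err(1, "open %s", path);

	vc.which = VM_ACTIVE_CPUS;
	vc.cpusetsize = sizeof(cpus);	/* must be >= sizeof(cpuset_t) */
	vc.cpus = &cpus;
	if (ioctl(fd, VM_GET_CPUS, &vc) != 0)
		err(1, "VM_GET_CPUS");

	for (i = 0; i < VM_MAXCPU; i++)
		if (CPU_ISSET(i, &cpus))
			printf("vcpu %d is active\n", i);
	return (0);
}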
diff --git a/sys/amd64/vmm/vmm_host.c b/sys/amd64/vmm/vmm_host.c
index 2763e04..9e5b966 100644
--- a/sys/amd64/vmm/vmm_host.c
+++ b/sys/amd64/vmm/vmm_host.c
@@ -66,11 +66,16 @@ vmm_host_state_init(void)
* XSAVE. Only permit a guest to use XSAVE features supported
* by the host. This ensures that the FPU state used by the
* guest is always a subset of the saved guest FPU state.
+ *
+	 * In addition, only permit known XSAVE features for which the
+	 * dependencies between features are understood, so that xsetbv
+	 * can be emulated properly.
*/
if (vmm_host_cr4 & CR4_XSAVE) {
vmm_xsave_limits.xsave_enabled = 1;
vmm_host_xcr0 = rxcr(0);
- vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0;
+ vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 &
+ (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512);
cpuid_count(0xd, 0x0, regs);
vmm_xsave_limits.xsave_max_size = regs[1];
diff --git a/sys/amd64/vmm/vmm_instruction_emul.c b/sys/amd64/vmm/vmm_instruction_emul.c
index 57cc654..185b198 100644
--- a/sys/amd64/vmm/vmm_instruction_emul.c
+++ b/sys/amd64/vmm/vmm_instruction_emul.c
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#else /* !_KERNEL */
#include <sys/types.h>
#include <sys/errno.h>
+#include <sys/_iovec.h>
#include <machine/vmm.h>
@@ -65,18 +66,26 @@ enum {
VIE_OP_TYPE_AND,
VIE_OP_TYPE_OR,
VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
VIE_OP_TYPE_LAST
};
/* struct vie_op.op_flags */
-#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */
-#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
+#define VIE_OP_F_NO_MODRM (1 << 3)
static const struct vie_op two_byte_opcodes[256] = {
[0xB6] = {
.op_byte = 0xB6,
.op_type = VIE_OP_TYPE_MOVZX,
},
+ [0xB7] = {
+ .op_byte = 0xB7,
+ .op_type = VIE_OP_TYPE_MOVZX,
+ },
[0xBE] = {
.op_byte = 0xBE,
.op_type = VIE_OP_TYPE_MOVSX,
@@ -88,6 +97,10 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x0F,
.op_type = VIE_OP_TYPE_TWO_BYTE
},
+ [0x3B] = {
+ .op_byte = 0x3B,
+ .op_type = VIE_OP_TYPE_CMP,
+ },
[0x88] = {
.op_byte = 0x88,
.op_type = VIE_OP_TYPE_MOV,
@@ -104,6 +117,22 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_byte = 0x8B,
.op_type = VIE_OP_TYPE_MOV,
},
+ [0xA1] = {
+ .op_byte = 0xA1,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xA3] = {
+ .op_byte = 0xA3,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xC6] = {
+ /* XXX Group 11 extended opcode - not just MOV */
+ .op_byte = 0xC6,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM8,
+ },
[0xC7] = {
.op_byte = 0xC7,
.op_type = VIE_OP_TYPE_MOV,
@@ -125,6 +154,11 @@ static const struct vie_op one_byte_opcodes[256] = {
.op_type = VIE_OP_TYPE_OR,
.op_flags = VIE_OP_F_IMM8,
},
+ [0xFF] = {
+ /* XXX Group 5 extended opcode - not just PUSH */
+ .op_byte = 0xFF,
+ .op_type = VIE_OP_TYPE_PUSH,
+ }
};
/* struct vie.mod */
@@ -175,18 +209,15 @@ vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
return (error);
}
-static int
-vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+static void
+vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
{
- uint64_t val;
- int error, rshift;
- enum vm_reg_name reg;
-
- rshift = 0;
- reg = gpr_map[vie->reg];
+ *lhbr = 0;
+ *reg = gpr_map[vie->reg];
/*
- * 64-bit mode imposes limitations on accessing legacy byte registers.
+ * 64-bit mode imposes limitations on accessing legacy high byte
+ * registers (lhbr).
*
* The legacy high-byte registers cannot be addressed if the REX
* prefix is present. In this case the values 4, 5, 6 and 7 of the
@@ -198,17 +229,56 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
*/
if (!vie->rex_present) {
if (vie->reg & 0x4) {
- /*
- * Obtain the value of %ah by reading %rax and shifting
- * right by 8 bits (same for %bh, %ch and %dh).
- */
- rshift = 8;
- reg = gpr_map[vie->reg & 0x3];
+ *lhbr = 1;
+ *reg = gpr_map[vie->reg & 0x3];
}
}
+}
+
+static int
+vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+{
+ uint64_t val;
+ int error, lhbr;
+ enum vm_reg_name reg;
+ vie_calc_bytereg(vie, &reg, &lhbr);
error = vm_get_register(vm, vcpuid, reg, &val);
- *rval = val >> rshift;
+
+ /*
+ * To obtain the value of a legacy high byte register shift the
+ * base register right by 8 bits (%ah = %rax >> 8).
+ */
+ if (lhbr)
+ *rval = val >> 8;
+ else
+ *rval = val;
+ return (error);
+}
+
+static int
+vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
+{
+ uint64_t origval, val, mask;
+ int error, lhbr;
+ enum vm_reg_name reg;
+
+ vie_calc_bytereg(vie, &reg, &lhbr);
+ error = vm_get_register(vm, vcpuid, reg, &origval);
+ if (error == 0) {
+ val = byte;
+ mask = 0xff;
+ if (lhbr) {
+ /*
+ * Shift left by 8 to store 'byte' in a legacy high
+ * byte register.
+ */
+ val <<= 8;
+ mask <<= 8;
+ }
+ val |= origval & ~mask;
+ error = vm_set_register(vm, vcpuid, reg, val);
+ }
return (error);
}
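
vie_calc_bytereg() maps the ModRM reg field either to a normal byte register or, when no REX prefix is present and the encoding is 4-7, to a legacy high-byte register living in bits 15:8 of the base register. The arithmetic used by the two functions above, shown standalone with an arbitrary register image:

#include <stdint.h>
#include <stdio.h>

/* Read an %ah-style high byte out of a 64-bit register image. */
static uint8_t
read_lhbr(uint64_t regval)
{
	return (regval >> 8);
}

/* Store a byte into bits 15:8, preserving everything else. */
static uint64_t
write_lhbr(uint64_t regval, uint8_t byte)
{
	return ((regval & ~(uint64_t)0xff00) | ((uint64_t)byte << 8));
}

int
main(void)
{
	uint64_t rax = 0x1122334455667788;

	printf("ah   = %#x\n", read_lhbr(rax));			/* 0x77 */
	printf("rax' = %#jx\n", (uintmax_t)write_lhbr(rax, 0xab));
	/* 0x112233445566ab88 */
	return (0);
}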
@@ -242,16 +312,52 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
}
/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
+ * Return the status flags that would result from doing (x - y).
*/
+static u_long
+getcc16(uint16_t x, uint16_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc32(uint32_t x, uint32_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc64(uint64_t x, uint64_t y)
+{
+ u_long rflags;
+
+ __asm __volatile("sub %1,%2; pushfq; popq %0" :
+ "=r" (rflags) : "m" (y), "r" (x));
+ return (rflags);
+}
+
+static u_long
+getcc(int opsize, uint64_t x, uint64_t y)
+{
+ KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
+ ("getcc: invalid operand size %d", opsize));
+
+ if (opsize == 2)
+ return (getcc16(x, y));
+ else if (opsize == 4)
+ return (getcc32(x, y));
+ else
+ return (getcc64(x, y));
+}
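
getcc() derives the flags CMP would produce by executing the subtraction on the host and capturing RFLAGS with pushfq/popq; emulate_cmp() below then splices only RFLAGS_STATUS_BITS into the guest's RFLAGS. A quick host-side check of the idea, reusing the 32-bit variant verbatim (amd64-only because of the inline assembly; the PSL_* constants come from <machine/psl.h>):

#include <sys/types.h>
#include <machine/psl.h>	/* PSL_C, PSL_Z, PSL_N, ... */

#include <stdint.h>
#include <stdio.h>

static u_long
getcc32(uint32_t x, uint32_t y)
{
	u_long rflags;

	__asm __volatile("sub %1,%2; pushfq; popq %0" :
	    "=r" (rflags) : "m" (y), "r" (x));
	return (rflags);
}

int
main(void)
{
	u_long fl = getcc32(1, 2);	/* 1 - 2: borrow, negative result */

	printf("CF=%d ZF=%d SF=%d\n",
	    (fl & PSL_C) != 0, (fl & PSL_Z) != 0, (fl & PSL_N) != 0);
	/* expected: CF=1 ZF=0 SF=1 */
	return (0);
}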
+
static int
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
@@ -261,7 +367,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
uint8_t byte;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -271,7 +377,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* 88/r: mov r/m8, r8
* REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
*/
- size = 1;
+ size = 1; /* override for byte operation */
error = vie_read_bytereg(vm, vcpuid, vie, &byte);
if (error == 0)
error = memwrite(vm, vcpuid, gpa, byte, size, arg);
@@ -279,11 +385,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
case 0x89:
/*
* MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m16, r16
* 89/r: mov r/m32, r32
* REX.W + 89/r mov r/m64, r64
*/
- if (vie->rex_w)
- size = 8;
reg = gpr_map[vie->reg];
error = vie_read_register(vm, vcpuid, reg, &val);
if (error == 0) {
@@ -292,38 +397,72 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
}
break;
case 0x8A:
+ /*
+ * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8A/r: mov r8, r/m8
+ * REX + 8A/r: mov r8, r/m8
+ */
+ size = 1; /* override for byte operation */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0)
+ error = vie_write_bytereg(vm, vcpuid, vie, val);
+ break;
case 0x8B:
/*
* MOV from mem (ModRM:r/m) to reg (ModRM:reg)
- * 8A/r: mov r/m8, r8
- * REX + 8A/r: mov r/m8, r8
+ * 8B/r: mov r16, r/m16
* 8B/r: mov r32, r/m32
* REX.W 8B/r: mov r64, r/m64
*/
- if (vie->op.op_byte == 0x8A)
- size = 1;
- else if (vie->rex_w)
- size = 8;
error = memread(vm, vcpuid, gpa, &val, size, arg);
if (error == 0) {
reg = gpr_map[vie->reg];
error = vie_update_register(vm, vcpuid, reg, val, size);
}
break;
+ case 0xA1:
+ /*
+ * MOV from seg:moffset to AX/EAX/RAX
+ * A1: mov AX, moffs16
+ * A1: mov EAX, moffs32
+ * REX.W + A1: mov RAX, moffs64
+ */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = VM_REG_GUEST_RAX;
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xA3:
+ /*
+ * MOV from AX/EAX/RAX to seg:moffset
+ * A3: mov moffs16, AX
+ * A3: mov moffs32, EAX
+ * REX.W + A3: mov moffs64, RAX
+ */
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0xC6:
+ /*
+ * MOV from imm8 to mem (ModRM:r/m)
+ * C6/0 mov r/m8, imm8
+ * REX + C6/0 mov r/m8, imm8
+ */
+ size = 1; /* override for byte operation */
+ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
+ break;
case 0xC7:
/*
- * MOV from imm32 to mem (ModRM:r/m)
+ * MOV from imm16/imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m16, imm16
* C7/0 mov r/m32, imm32
* REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
*/
- val = vie->immediate; /* already sign-extended */
-
- if (vie->rex_w)
- size = 8;
-
- if (size != 8)
- val &= size2mask[size];
-
+ val = vie->immediate & size2mask[size];
error = memwrite(vm, vcpuid, gpa, val, size, arg);
break;
default:
@@ -333,17 +472,6 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
-/*
- * The following simplifying assumptions are made during emulation:
- *
- * - guest is in 64-bit mode
- * - default address size is 64-bits
- * - default operand size is 32-bits
- *
- * - operand size override is not supported
- *
- * - address size override is not supported
- */
static int
emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
mem_region_read_t memread, mem_region_write_t memwrite,
@@ -353,7 +481,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -362,8 +490,9 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* MOV and zero extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F B6/r movzx r/m8, r32
- * REX.W + 0F B6/r movzx r/m8, r64
+ * 0F B6/r movzx r16, r/m8
+ * 0F B6/r movzx r32, r/m8
+ * REX.W + 0F B6/r movzx r64, r/m8
*/
/* get the first operand */
@@ -374,19 +503,39 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
+ /* zero-extend byte */
+ val = (uint8_t)val;
/* write the result */
error = vie_update_register(vm, vcpuid, reg, val, size);
break;
+ case 0xB7:
+ /*
+ * MOV and zero extend word from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F B7/r movzx r32, r/m16
+ * REX.W + 0F B7/r movzx r64, r/m16
+ */
+ error = memread(vm, vcpuid, gpa, &val, 2, arg);
+ if (error)
+ return (error);
+
+ reg = gpr_map[vie->reg];
+
+ /* zero-extend word */
+ val = (uint16_t)val;
+
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
case 0xBE:
/*
* MOV and sign extend byte from mem (ModRM:r/m) to
* reg (ModRM:reg).
*
- * 0F BE/r movsx r/m8, r32
- * REX.W + 0F BE/r movsx r/m8, r64
+ * 0F BE/r movsx r16, r/m8
+ * 0F BE/r movsx r32, r/m8
+ * REX.W + 0F BE/r movsx r64, r/m8
*/
/* get the first operand */
@@ -397,9 +546,6 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
/* get the second operand */
reg = gpr_map[vie->reg];
- if (vie->rex_w)
- size = 8;
-
/* sign extend byte */
val = (int8_t)val;
@@ -420,7 +566,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
enum vm_reg_name reg;
uint64_t val1, val2;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -429,11 +575,10 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
* result in reg.
*
+ * 23/r and r16, r/m16
* 23/r and r32, r/m32
* REX.W + 23/r and r64, r/m64
*/
- if (vie->rex_w)
- size = 8;
/* get the first operand */
reg = gpr_map[vie->reg];
@@ -455,8 +600,9 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* AND mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 81/ and r/m32, imm32
- * REX.W + 81/ and r/m64, imm32 sign-extended to 64
+ * 81 /4 and r/m16, imm16
+ * 81 /4 and r/m32, imm32
+ * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
*
* Currently, only the AND operation of the 0x81 opcode
* is implemented (ModRM:reg = b100).
@@ -464,9 +610,6 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 4)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -492,7 +635,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
int error, size;
uint64_t val1;
- size = 4;
+ size = vie->opsize;
error = EINVAL;
switch (vie->op.op_byte) {
@@ -501,8 +644,9 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
* OR mem (ModRM:r/m) with immediate and store the
* result in mem.
*
- * 83/ OR r/m32, imm8 sign-extended to 32
- * REX.W + 83/ OR r/m64, imm8 sign-extended to 64
+ * 83 /1 OR r/m16, imm8 sign-extended to 16
+ * 83 /1 OR r/m32, imm8 sign-extended to 32
+ * REX.W + 83/1 OR r/m64, imm8 sign-extended to 64
*
* Currently, only the OR operation of the 0x83 opcode
* is implemented (ModRM:reg = b001).
@@ -510,9 +654,6 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
if ((vie->reg & 7) != 1)
break;
- if (vie->rex_w)
- size = 8;
-
/* get the first operand */
error = memread(vm, vcpuid, gpa, &val1, size, arg);
if (error)
@@ -531,10 +672,167 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (error);
}
+#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t op1, op2, rflags, rflags2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ switch (vie->op.op_byte) {
+ case 0x3B:
+ /*
+ * 3B/r CMP r16, r/m16
+ * 3B/r CMP r32, r/m32
+ * REX.W + 3B/r CMP r64, r/m64
+ *
+ * Compare first operand (reg) with second operand (r/m) and
+ * set status flags in EFLAGS register. The comparison is
+ * performed by subtracting the second operand from the first
+ * operand and then setting the status flags.
+ */
+
+ /* Get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &op1);
+ if (error)
+ return (error);
+
+ /* Get the second operand */
+ error = memread(vm, vcpuid, gpa, &op2, size, arg);
+ if (error)
+ return (error);
+
+ break;
+ default:
+ return (EINVAL);
+ }
+ rflags2 = getcc(size, op1, op2);
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ struct seg_desc ss_desc;
+ uint64_t cr0, rflags, rsp, stack_gla, val;
+ int error, size, stackaddrsize;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * PUSH is part of the group 5 extended opcodes and is identified
+ * by ModRM:reg = b110.
+ */
+ if ((vie->reg & 7) != 6)
+ return (EINVAL);
+
+ size = vie->opsize;
+ /*
+	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
+ */
+ if (paging->cpu_mode == CPU_MODE_REAL) {
+ stackaddrsize = 2;
+ } else if (paging->cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
+ * - Stack pointer size is always 64-bits.
+ * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
+ * - 16-bit PUSH/POP is supported by using the operand size
+ * override prefix (66H).
+ */
+ stackaddrsize = 8;
+ size = vie->opsize_override ? 2 : 8;
+ } else {
+ /*
+		 * In protected or compatibility mode the 'B' flag in the
+ * stack-segment descriptor determines the size of the
+ * stack pointer.
+ */
+ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
+ KASSERT(error == 0, ("%s: error %d getting SS descriptor",
+ __func__, error));
+ if (SEG_DESC_DEF32(ss_desc.access))
+ stackaddrsize = 4;
+ else
+ stackaddrsize = 2;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+ KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+
+ rsp -= size;
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
+ rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (0);
+ }
+
+ error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
+ copyinfo, nitems(copyinfo));
+ if (error == -1) {
+ /*
+ * XXX cannot return a negative error value here because it
+ * ends up being the return value of the VM_RUN() ioctl and
+ * is interpreted as a pseudo-error (for e.g. ERESTART).
+ */
+ return (EFAULT);
+ } else if (error == 1) {
+ /* Resume guest execution to handle page fault */
+ return (0);
+ }
+
+ error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (error == 0) {
+ vm_copyout(vm, vcpuid, &val, copyinfo, size);
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
+ stackaddrsize);
+ KASSERT(error == 0, ("error %d updating rsp", error));
+ }
+#ifdef _KERNEL
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+#endif
+ return (error);
+}
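
As a concrete case: a 64-bit guest executing pushw (%rax) (bytes 66 FF 30) against an MMIO address reaches this path with vie->opsize_override set, so two bytes are read from the device model and copied to %rsp - 2, while %rsp itself is updated and wrapped as a full 64-bit value (stackaddrsize 8).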
+
int
vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
- mem_region_read_t memread, mem_region_write_t memwrite,
- void *memarg)
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *memarg)
{
int error;
@@ -542,6 +840,14 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
return (EINVAL);
switch (vie->op.op_type) {
+ case VIE_OP_TYPE_PUSH:
+ error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_CMP:
+ error = emulate_cmp(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
case VIE_OP_TYPE_MOV:
error = emulate_mov(vm, vcpuid, gpa, vie,
memread, memwrite, memarg);
@@ -612,7 +918,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
struct seg_desc *desc, uint64_t offset, int length, int addrsize,
int prot, uint64_t *gla)
{
- uint64_t low_limit, high_limit, segbase;
+ uint64_t firstoff, low_limit, high_limit, segbase;
int glasize, type;
KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
@@ -622,6 +928,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
("%s: invalid prot %#x", __func__, prot));
+ firstoff = offset;
if (cpu_mode == CPU_MODE_64BIT) {
KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
"size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
@@ -635,7 +942,7 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* then the descriptor is unusable and attempting to use
* it results in a #GP(0).
*/
- if (SEG_DESC_UNUSABLE(desc))
+ if (SEG_DESC_UNUSABLE(desc->access))
return (-1);
/*
@@ -644,13 +951,13 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
* descriptor that is not present. If this was the case then
* it would have been checked before the VM-exit.
*/
- KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x",
- seg, desc->access));
+ KASSERT(SEG_DESC_PRESENT(desc->access),
+ ("segment %d not present: %#x", seg, desc->access));
/*
* The descriptor type must indicate a code/data segment.
*/
- type = SEG_DESC_TYPE(desc);
+ type = SEG_DESC_TYPE(desc->access);
KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
"descriptor type %#x", seg, type));
@@ -679,7 +986,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
if ((type & 0xC) == 0x4) {
/* expand-down data segment */
low_limit = desc->limit + 1;
- high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff;
+ high_limit = SEG_DESC_DEF32(desc->access) ?
+ 0xffffffff : 0xffff;
} else {
/* code segment or expand-up data segment */
low_limit = 0;
@@ -707,11 +1015,11 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
}
/*
- * Truncate 'offset' to the effective address size before adding
+ * Truncate 'firstoff' to the effective address size before adding
* it to the segment base.
*/
- offset &= vie_size2mask(addrsize);
- *gla = (segbase + offset) & vie_size2mask(glasize);
+ firstoff &= vie_size2mask(addrsize);
+ *gla = (segbase + firstoff) & vie_size2mask(glasize);
return (0);
}
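
The truncation to the effective address size matters for the smaller sizes: a push on a 16-bit stack with %sp = 0 computes rsp - 2 as 0xfffffffffffffffe, and masking with vie_size2mask(2) wraps that to 0xfffe before the SS base is added, rather than producing a linear address far outside the segment.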
@@ -946,45 +1254,24 @@ fault:
}
int
-vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
+vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t rip, int inst_length, struct vie *vie)
{
- int n, error, prot;
- uint64_t gpa, off;
- void *hpa, *cookie;
-
- /*
- * XXX cache previously fetched instructions using 'rip' as the tag
- */
+ struct vm_copyinfo copyinfo[2];
+ int error, prot;
- prot = VM_PROT_READ | VM_PROT_EXECUTE;
if (inst_length > VIE_INST_SIZE)
panic("vmm_fetch_instruction: invalid length %d", inst_length);
- /* Copy the instruction into 'vie' */
- while (vie->num_valid < inst_length) {
- error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
- if (error)
- return (error);
-
- off = gpa & PAGE_MASK;
- n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
-
- if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
- break;
-
- bcopy(hpa, &vie->inst[vie->num_valid], n);
-
- vm_gpa_release(cookie);
-
- rip += n;
- vie->num_valid += n;
+ prot = PROT_READ | PROT_EXEC;
+ error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ vie->num_valid = inst_length;
}
-
- if (vie->num_valid)
- return (0);
- else
- return (-1);
+ return (error);
}
static int
@@ -1006,24 +1293,65 @@ vie_advance(struct vie *vie)
}
static int
-decode_rex(struct vie *vie)
+decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
{
uint8_t x;
- if (vie_peek(vie, &x))
- return (-1);
+ while (1) {
+ if (vie_peek(vie, &x))
+ return (-1);
- if (x >= 0x40 && x <= 0x4F) {
- vie->rex_present = 1;
+ if (x == 0x66)
+ vie->opsize_override = 1;
+ else if (x == 0x67)
+ vie->addrsize_override = 1;
+ else
+ break;
+
+ vie_advance(vie);
+ }
+ /*
+ * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
+ * - Only one REX prefix is allowed per instruction.
+ * - The REX prefix must immediately precede the opcode byte or the
+ * escape opcode byte.
+ * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
+ * the mandatory prefix must come before the REX prefix.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
+ vie->rex_present = 1;
vie->rex_w = x & 0x8 ? 1 : 0;
vie->rex_r = x & 0x4 ? 1 : 0;
vie->rex_x = x & 0x2 ? 1 : 0;
vie->rex_b = x & 0x1 ? 1 : 0;
-
vie_advance(vie);
}
+ /*
+ * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
+ */
+ if (cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * Default address size is 64-bits and default operand size
+ * is 32-bits.
+ */
+ vie->addrsize = vie->addrsize_override ? 4 : 8;
+ if (vie->rex_w)
+ vie->opsize = 8;
+ else if (vie->opsize_override)
+ vie->opsize = 2;
+ else
+ vie->opsize = 4;
+ } else if (cs_d) {
+ /* Default address and operand sizes are 32-bits */
+ vie->addrsize = vie->addrsize_override ? 2 : 4;
+ vie->opsize = vie->opsize_override ? 2 : 4;
+ } else {
+ /* Default address and operand sizes are 16-bits */
+ vie->addrsize = vie->addrsize_override ? 4 : 2;
+ vie->opsize = vie->opsize_override ? 4 : 2;
+ }
return (0);
}
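
With the prefixes consumed, the operand and address sizes are fixed before the opcode byte is read. What the code above computes, per mode (cs_d is the CS.D bit handed down from the exit handler; REX.W takes precedence over a 66h prefix):

	mode             prefix     addrsize   opsize
	64-bit           none           8          4
	64-bit           REX.W          8          8
	64-bit           66h            8          2
	64-bit           67h            4          4
	32-bit (cs_d=1)  none           4          4
	32-bit (cs_d=1)  66h            4          2
	32-bit (cs_d=1)  67h            2          4
	16-bit           none           2          2
	16-bit           66h            2          4
	16-bit           67h            4          2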
@@ -1070,6 +1398,12 @@ decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
uint8_t x;
+ if (cpu_mode == CPU_MODE_REAL)
+ return (-1);
+
+ if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
+ return (0);
+
if (vie_peek(vie, &x))
return (-1);
@@ -1248,20 +1582,32 @@ decode_immediate(struct vie *vie)
union {
char buf[4];
int8_t signed8;
+ int16_t signed16;
int32_t signed32;
} u;
/* Figure out immediate operand size (if any) */
- if (vie->op.op_flags & VIE_OP_F_IMM)
- vie->imm_bytes = 4;
- else if (vie->op.op_flags & VIE_OP_F_IMM8)
+ if (vie->op.op_flags & VIE_OP_F_IMM) {
+ /*
+ * Section 2.2.1.5 "Immediates", Intel SDM:
+ * In 64-bit mode the typical size of immediate operands
+		 * remains 32-bits. When the operand size is 64-bits, the
+ * processor sign-extends all immediates to 64-bits prior
+ * to their use.
+ */
+ if (vie->opsize == 4 || vie->opsize == 8)
+ vie->imm_bytes = 4;
+ else
+ vie->imm_bytes = 2;
+ } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
vie->imm_bytes = 1;
+ }
if ((n = vie->imm_bytes) == 0)
return (0);
- if (n != 1 && n != 4)
- panic("decode_immediate: invalid imm_bytes %d", n);
+ KASSERT(n == 1 || n == 2 || n == 4,
+ ("%s: invalid number of immediate bytes: %d", __func__, n));
for (i = 0; i < n; i++) {
if (vie_peek(vie, &x))
@@ -1270,12 +1616,47 @@ decode_immediate(struct vie *vie)
u.buf[i] = x;
vie_advance(vie);
}
-
+
+ /* sign-extend the immediate value before use */
if (n == 1)
- vie->immediate = u.signed8; /* sign-extended */
+ vie->immediate = u.signed8;
+ else if (n == 2)
+ vie->immediate = u.signed16;
else
- vie->immediate = u.signed32; /* sign-extended */
+ vie->immediate = u.signed32;
+
+ return (0);
+}
+
+static int
+decode_moffset(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[8];
+ uint64_t u64;
+ } u;
+
+ if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
+ return (0);
+
+ /*
+ * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
+ * The memory offset size follows the address-size of the instruction.
+ */
+ n = vie->addrsize;
+ KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
+
+ u.u64 = 0;
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+ vie->displacement = u.u64;
return (0);
}
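
For example, in 64-bit mode the direct-offset form movabs 0xfed000f0, %eax can be encoded as A1 F0 00 D0 FE 00 00 00 00: no ModRM byte (hence VIE_OP_F_NO_MODRM), an 8-byte moffset matching the 64-bit address size, and vie->opsize 4 selecting EAX; with a 67h prefix the moffset shrinks to 4 bytes.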
@@ -1300,7 +1681,7 @@ static int
verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
{
int error;
- uint64_t base, idx;
+ uint64_t base, idx, gla2;
/* Skip 'gla' verification */
if (gla == VIE_INVALID_GLA)
@@ -1333,11 +1714,14 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
}
}
- if (base + vie->scale * idx + vie->displacement != gla) {
+ /* XXX assuming that the base address of the segment is 0 */
+ gla2 = base + vie->scale * idx + vie->displacement;
+ gla2 &= size2mask[vie->addrsize];
+ if (gla != gla2) {
printf("verify_gla mismatch: "
"base(0x%0lx), scale(%d), index(0x%0lx), "
- "disp(0x%0lx), gla(0x%0lx)\n",
- base, vie->scale, idx, vie->displacement, gla);
+ "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla, gla2);
return (-1);
}
@@ -1346,13 +1730,11 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
int
vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
- enum vm_cpu_mode cpu_mode, struct vie *vie)
+ enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
{
- if (cpu_mode == CPU_MODE_64BIT) {
- if (decode_rex(vie))
- return (-1);
- }
+ if (decode_prefixes(vie, cpu_mode, cs_d))
+ return (-1);
if (decode_opcode(vie))
return (-1);
@@ -1365,10 +1747,13 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
if (decode_displacement(vie))
return (-1);
-
+
if (decode_immediate(vie))
return (-1);
+ if (decode_moffset(vie))
+ return (-1);
+
if (verify_inst_length(vie))
return (-1);
diff --git a/sys/amd64/vmm/vmm_stat.c b/sys/amd64/vmm/vmm_stat.c
index 4b960ea..9e195f2 100644
--- a/sys/amd64/vmm/vmm_stat.c
+++ b/sys/amd64/vmm/vmm_stat.c
@@ -52,8 +52,10 @@ static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS];
static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat");
+#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t))
+
void
-vmm_stat_init(void *arg)
+vmm_stat_register(void *arg)
{
struct vmm_stat_type *vst = arg;
@@ -81,12 +83,21 @@ vmm_stat_init(void *arg)
int
vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
{
- int i;
+ struct vmm_stat_type *vst;
uint64_t *stats;
+ int i;
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
-
+
+ /* Let stats functions update their counters */
+ for (i = 0; i < vst_num_types; i++) {
+ vst = vsttab[i];
+ if (vst->func != NULL)
+ (*vst->func)(vm, vcpu, vst);
+ }
+
+ /* Copy over the stats */
stats = vcpu_stats(vm, vcpu);
for (i = 0; i < vst_num_elems; i++)
buf[i] = stats[i];
@@ -97,11 +108,15 @@ vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf)
void *
vmm_stat_alloc(void)
{
- u_long size;
-
- size = vst_num_elems * sizeof(uint64_t);
- return (malloc(size, M_VMM_STAT, M_ZERO | M_WAITOK));
+ return (malloc(vst_size, M_VMM_STAT, M_WAITOK));
+}
+
+void
+vmm_stat_init(void *vp)
+{
+
+ bzero(vp, vst_size);
}
void
diff --git a/sys/amd64/vmm/vmm_stat.h b/sys/amd64/vmm/vmm_stat.h
index 831d206..c6d0f0b 100644
--- a/sys/amd64/vmm/vmm_stat.h
+++ b/sys/amd64/vmm/vmm_stat.h
@@ -42,20 +42,28 @@ enum vmm_stat_scope {
VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */
};
+struct vmm_stat_type;
+typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu,
+ struct vmm_stat_type *stat);
+
struct vmm_stat_type {
int index; /* position in the stats buffer */
int nelems; /* standalone or array */
const char *desc; /* description of statistic */
+ vmm_stat_func_t func;
enum vmm_stat_scope scope;
};
-void vmm_stat_init(void *arg);
+void vmm_stat_register(void *arg);
-#define VMM_STAT_DEFINE(type, nelems, desc, scope) \
+#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \
struct vmm_stat_type type[1] = { \
- { -1, nelems, desc, scope } \
+ { -1, nelems, desc, func, scope } \
}; \
- SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type)
+
+#define VMM_STAT_DEFINE(type, nelems, desc, scope) \
+ VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope)
#define VMM_STAT_DECLARE(type) \
extern struct vmm_stat_type type[1]
@@ -67,10 +75,14 @@ void vmm_stat_init(void *arg);
#define VMM_STAT_AMD(type, desc) \
VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD)
+#define VMM_STAT_FUNC(type, desc, func) \
+ VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY)
+
#define VMM_STAT_ARRAY(type, nelems, desc) \
VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY)
void *vmm_stat_alloc(void);
+void vmm_stat_init(void *vp);
void vmm_stat_free(void *vp);
/*
@@ -92,9 +104,22 @@ vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst,
stats[vst->index + statidx] += x;
#endif
}
-
static void __inline
+vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst,
+ int statidx, uint64_t val)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats;
+
+ stats = vcpu_stats(vm, vcpu);
+
+ if (vst->index >= 0 && statidx < vst->nelems)
+ stats[vst->index + statidx] = val;
+#endif
+}
+
+static void __inline
vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
{
@@ -103,6 +128,15 @@ vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
#endif
}
+static void __inline
+vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val)
+{
+
+#ifdef VMM_KEEP_STATS
+ vmm_stat_array_set(vm, vcpu, vst, 0, val);
+#endif
+}
+
VMM_STAT_DECLARE(VCPU_MIGRATIONS);
VMM_STAT_DECLARE(VMEXIT_COUNT);
VMM_STAT_DECLARE(VMEXIT_EXTINT);
diff --git a/sys/amd64/vmm/x86.c b/sys/amd64/vmm/x86.c
index 704bf68..9f38bf3 100644
--- a/sys/amd64/vmm/x86.c
+++ b/sys/amd64/vmm/x86.c
@@ -243,6 +243,26 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id,
/* leaf 0 */
if (*ecx == 0) {
+ cpuid_count(*eax, *ecx, regs);
+
+ /* Only leaf 0 is supported */
+ regs[0] = 0;
+
+ /*
+ * Expose known-safe features.
+ */
+ regs[1] &= (CPUID_STDEXT_FSGSBASE |
+ CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
+ CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
+ CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
+ CPUID_STDEXT_AVX512F |
+ CPUID_STDEXT_AVX512PF |
+ CPUID_STDEXT_AVX512ER |
+ CPUID_STDEXT_AVX512CD);
+ regs[2] = 0;
+ regs[3] = 0;
+
+ /* Advertise INVPCID if it is enabled. */
error = vm_get_capability(vm, vcpu_id,
VM_CAP_ENABLE_INVPCID, &enable_invpcid);
if (error == 0 && enable_invpcid)