author     jhb <jhb@FreeBSD.org>    2014-07-21 19:08:02 +0000
committer  jhb <jhb@FreeBSD.org>    2014-07-21 19:08:02 +0000
commit     e6b48465b7c368666e10a2bd8f4f500483497b24 (patch)
tree       7e529f938e1d777df1f54045191fa3003056dbda /sys/amd64/vmm
parent     b164bf591711a4c455ca47e1f58b0bb91e5e904c (diff)
MFC 264353,264509,264768,264770,264825,264846,264988,265114,265165,265365,
265941,265951,266390,266550,266910:
Various bhyve fixes:
- Don't save host's return address in 'struct vmxctx'.
- Permit non-32-bit accesses to local APIC registers.
- Factor out common ioport handler code.
- Use calloc() in favor of malloc + memset.
- Change the vlapic timer frequency to be in the ballpark of contemporary
hardware.
- Allow the guest to read the TSC via MSR 0x10.
- A VMCS is always inactive when it exits the vmx_run() loop. Remove
redundant code and the misleading comment that suggest otherwise.
- Ignore writes to the microcode update MSR. This MSR is accessed by RHEL7
guests.
- Add KTR tracepoints to annotate wrmsr and rdmsr VM exits (the edx:eax
value packing these tracepoints log is sketched just after this list).
- Provide an alias for the userboot console and name it 'comconsole'.
- Use EV_ADD to create an mevent and EV_ENABLE to enable it.
- abort(3) the process in response to a VMEXIT_ABORT.
- Don't include the guest memory segments in the bhyve(8) process core dump.
- Make the vmx asm code dtrace-fbt-friendly.
- Allow vmx_getdesc() and vmx_setdesc() to be called for a vcpu that is in
the VCPU_RUNNING state.
- Enable VMX in the IA32_FEATURE_CONTROL MSR if it isn't enabled and the MSR
isn't locked.
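On x86, a 64-bit MSR value travels split across two 32-bit registers: %ecx
selects the MSR, %edx carries the high half, and %eax the low half, which is
why the wrmsr tracepoint in the diff below reconstructs the value as
(uint64_t)edx << 32 | eax. A minimal standalone sketch of that packing,
assuming nothing beyond standard C (the helper names are illustrative, not
part of the commit):

	#include <stdint.h>

	/*
	 * Combine the %edx:%eax register halves into the 64-bit MSR value,
	 * as the wrmsr exit handler does with guest_rdx/guest_rax.
	 */
	static inline uint64_t
	msr_value(uint32_t edx, uint32_t eax)
	{
		return ((uint64_t)edx << 32 | eax);
	}

	/*
	 * Split a 64-bit MSR value back into %edx:%eax, as a rdmsr
	 * emulation must do before resuming the guest.
	 */
	static inline void
	msr_split(uint64_t val, uint32_t *edx, uint32_t *eax)
	{
		*edx = (uint32_t)(val >> 32);	/* high 32 bits */
		*eax = (uint32_t)val;		/* low 32 bits */
	}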
Diffstat (limited to 'sys/amd64/vmm')
-rw-r--r--  sys/amd64/vmm/intel/vmcs.c          16
-rw-r--r--  sys/amd64/vmm/intel/vmcs.h           4
-rw-r--r--  sys/amd64/vmm/intel/vmx.c           45
-rw-r--r--  sys/amd64/vmm/intel/vmx.h            6
-rw-r--r--  sys/amd64/vmm/intel/vmx_genassym.c   1
-rw-r--r--  sys/amd64/vmm/intel/vmx_support.S   35
-rw-r--r--  sys/amd64/vmm/io/vlapic.c            7
-rw-r--r--  sys/amd64/vmm/vmm_ioport.c          34
-rw-r--r--  sys/amd64/vmm/vmm_lapic.c            8
9 files changed, 92 insertions, 64 deletions
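For context on the first vmx.c hunk below: per the Intel SDM,
IA32_FEATURE_CONTROL bit 0 is a lock bit and bit 2 enables VMX outside SMX;
once the lock bit is set, the MSR cannot be rewritten until reset, so the
hypervisor may set the bits itself only while the MSR is still unlocked. A
hedged sketch of that decision as a pure function (the constants mirror
FreeBSD's names; factoring it into a helper is illustrative, not how the
commit structures it):

	#include <stdbool.h>
	#include <stdint.h>

	#define IA32_FEATURE_CONTROL_LOCK	0x01	/* bit 0: MSR locked */
	#define IA32_FEATURE_CONTROL_VMX_EN	0x04	/* bit 2: VMX outside SMX */

	/*
	 * Given the current IA32_FEATURE_CONTROL value, report whether VMX
	 * is usable.  If the MSR is unlocked, compute the value to write
	 * back that enables VMX and sets the lock bit in the same write.
	 */
	static bool
	vmx_feature_control_ok(uint64_t fc, uint64_t *newfc, bool *need_write)
	{
		*need_write = false;
		if (fc & IA32_FEATURE_CONTROL_LOCK)
			return ((fc & IA32_FEATURE_CONTROL_VMX_EN) != 0);

		/* Unlocked: enable VMX and lock the MSR ourselves. */
		*newfc = fc | IA32_FEATURE_CONTROL_VMX_EN |
		    IA32_FEATURE_CONTROL_LOCK;
		*need_write = true;
		return (true);
	}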
diff --git a/sys/amd64/vmm/intel/vmcs.c b/sys/amd64/vmm/intel/vmcs.c
index 1ddefe0..cc97d95 100644
--- a/sys/amd64/vmm/intel/vmcs.c
+++ b/sys/amd64/vmm/intel/vmcs.c
@@ -231,7 +231,7 @@ vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val)
 }
 
 int
-vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
 {
 	int error;
 	uint32_t base, limit, access;
@@ -240,7 +240,8 @@ vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
 	if (error != 0)
 		panic("vmcs_setdesc: invalid segment register %d", seg);
 
-	VMPTRLD(vmcs);
+	if (!running)
+		VMPTRLD(vmcs);
 
 	if ((error = vmwrite(base, desc->base)) != 0)
 		goto done;
@@ -252,12 +253,13 @@ vmcs_setdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
 			goto done;
 	}
 done:
-	VMCLEAR(vmcs);
+	if (!running)
+		VMCLEAR(vmcs);
 	return (error);
 }
 
 int
-vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
+vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
 {
 	int error;
 	uint32_t base, limit, access;
@@ -267,7 +269,8 @@ vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
 	if (error != 0)
 		panic("vmcs_getdesc: invalid segment register %d", seg);
 
-	VMPTRLD(vmcs);
+	if (!running)
+		VMPTRLD(vmcs);
 
 	if ((error = vmread(base, &u64)) != 0)
 		goto done;
 	desc->base = u64;
@@ -282,7 +285,8 @@ vmcs_getdesc(struct vmcs *vmcs, int seg, struct seg_desc *desc)
 		desc->access = u64;
 	}
 done:
-	VMCLEAR(vmcs);
+	if (!running)
+		VMCLEAR(vmcs);
 	return (error);
 }
diff --git a/sys/amd64/vmm/intel/vmcs.h b/sys/amd64/vmm/intel/vmcs.h
index 9cde999..657d5b0 100644
--- a/sys/amd64/vmm/intel/vmcs.h
+++ b/sys/amd64/vmm/intel/vmcs.h
@@ -49,9 +49,9 @@ int	vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
 int	vmcs_init(struct vmcs *vmcs);
 int	vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv);
 int	vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val);
-int	vmcs_getdesc(struct vmcs *vmcs, int ident,
+int	vmcs_getdesc(struct vmcs *vmcs, int running, int ident,
 	    struct seg_desc *desc);
-int	vmcs_setdesc(struct vmcs *vmcs, int ident,
+int	vmcs_setdesc(struct vmcs *vmcs, int running, int ident,
 	    struct seg_desc *desc);
 
 static __inline uint64_t
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 1c39552..e85e5e4 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -509,6 +509,15 @@
 static void
 vmx_enable(void *arg __unused)
 {
 	int error;
+	uint64_t feature_control;
+
+	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
+	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
+		wrmsr(MSR_IA32_FEATURE_CONTROL,
+		    feature_control | IA32_FEATURE_CONTROL_VMX_EN |
+		    IA32_FEATURE_CONTROL_LOCK);
+	}
 
 	load_cr4(rcr4() | CR4_VMXE);
 
@@ -544,7 +553,7 @@ vmx_init(int ipinum)
 	 * are set (bits 0 and 2 respectively).
 	 */
 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
-	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
+	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
 	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
 		printf("vmx_init: VMX operation disabled by BIOS\n");
 		return (ENXIO);
@@ -863,6 +872,11 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 	 * MSR_EFER is saved and restored in the guest VMCS area on a
 	 * VM exit and entry respectively. It is also restored from the
 	 * host VMCS area on a VM exit.
+	 *
+	 * The TSC MSR is exposed read-only. Writes are disallowed as that
+	 * will impact the host TSC.
+	 * XXX Writes would be implemented with a wrmsr trap, and
+	 * then modifying the TSC offset in the VMCS.
 	 */
 	if (guest_msr_rw(vmx, MSR_GSBASE) ||
 	    guest_msr_rw(vmx, MSR_FSBASE) ||
@@ -870,7 +884,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
 	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
 	    guest_msr_rw(vmx, MSR_KGSBASE) ||
-	    guest_msr_rw(vmx, MSR_EFER))
+	    guest_msr_rw(vmx, MSR_EFER) ||
+	    guest_msr_ro(vmx, MSR_TSC))
 		panic("vmx_vminit: error setting guest msr access");
 
 	/*
@@ -1829,6 +1844,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
 		retu = false;
 		ecx = vmxctx->guest_rcx;
+		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
 		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
 		if (error) {
 			vmexit->exitcode = VM_EXITCODE_RDMSR;
@@ -1847,6 +1863,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 		eax = vmxctx->guest_rax;
 		ecx = vmxctx->guest_rcx;
 		edx = vmxctx->guest_rdx;
+		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
+		    ecx, (uint64_t)edx << 32 | eax);
 		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
 		    (uint64_t)edx << 32 | eax, &retu);
 		if (error) {
@@ -2257,7 +2275,7 @@ vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
 static void
 vmx_vmcleanup(void *arg)
 {
-	int i, error;
+	int i;
 	struct vmx *vmx = arg;
 
 	if (apic_access_virtualization(vmx, 0))
@@ -2266,13 +2284,6 @@ vmx_vmcleanup(void *arg)
 	for (i = 0; i < VM_MAXCPU; i++)
 		vpid_free(vmx->state[i].vpid);
 
-	/*
-	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
-	 */
-	error = vmclear(&vmx->vmcs[0]);
-	if (error != 0)
-		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
-
 	free(vmx, M_VMX);
 
 	return;
@@ -2430,17 +2441,27 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
 static int
 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
+	int hostcpu, running;
 	struct vmx *vmx = arg;
 
-	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu)
+		panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
 }
 
 static int
 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 {
+	int hostcpu, running;
 	struct vmx *vmx = arg;
 
-	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu)
+		panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
 }
 
 static int
diff --git a/sys/amd64/vmm/intel/vmx.h b/sys/amd64/vmm/intel/vmx.h
index 80bfd72..208fcee 100644
--- a/sys/amd64/vmm/intel/vmx.h
+++ b/sys/amd64/vmm/intel/vmx.h
@@ -60,7 +60,6 @@ struct vmxctx {
 	register_t	host_rbp;
 	register_t	host_rsp;
 	register_t	host_rbx;
-	register_t	host_rip;
 	/*
 	 * XXX todo debug registers and fpu state
 	 */
@@ -68,7 +67,7 @@ struct vmxctx {
 	int		inst_fail_status;
 
 	/*
-	 * The pmap needs to be deactivated in vmx_exit_guest()
+	 * The pmap needs to be deactivated in vmx_enter_guest()
 	 * so keep a copy of the 'pmap' in each vmxctx.
 	 */
 	struct pmap	*pmap;
 
@@ -122,10 +121,11 @@ CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
 #define	VMX_VMLAUNCH_ERROR	2
 #define	VMX_INVEPT_ERROR	3
 int	vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched);
-void	vmx_exit_guest(void);
 void	vmx_call_isr(uintptr_t entry);
 
 u_long	vmx_fix_cr0(u_long cr0);
 u_long	vmx_fix_cr4(u_long cr4);
 
+extern char	vmx_exit_guest[];
+
 #endif
diff --git a/sys/amd64/vmm/intel/vmx_genassym.c b/sys/amd64/vmm/intel/vmx_genassym.c
index 5c91fec..e1b98d6 100644
--- a/sys/amd64/vmm/intel/vmx_genassym.c
+++ b/sys/amd64/vmm/intel/vmx_genassym.c
@@ -65,7 +65,6 @@ ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
 ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
 ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
 ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
-ASSYM(VMXCTX_HOST_RIP, offsetof(struct vmxctx, host_rip));
 
 ASSYM(VMXCTX_INST_FAIL_STATUS, offsetof(struct vmxctx, inst_fail_status));
 ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
diff --git a/sys/amd64/vmm/intel/vmx_support.S b/sys/amd64/vmm/intel/vmx_support.S
index 9e8cf2d..840b7e0 100644
--- a/sys/amd64/vmm/intel/vmx_support.S
+++ b/sys/amd64/vmm/intel/vmx_support.S
@@ -37,6 +37,10 @@
 #define	LK
 #endif
 
+/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
+#define	VENTER	push %rbp ; mov %rsp,%rbp
+#define	VLEAVE	pop %rbp
+
 /*
  * Assumes that %rdi holds a pointer to the 'vmxctx'.
  *
@@ -72,8 +76,7 @@
  *
  * Assumes that %rdi holds a pointer to the 'vmxctx'.
  */
-#define	VMX_HOST_SAVE(tmpreg)					\
-	movq	(%rsp), tmpreg;		/* return address */	\
+#define	VMX_HOST_SAVE						\
 	movq	%r15, VMXCTX_HOST_R15(%rdi);			\
 	movq	%r14, VMXCTX_HOST_R14(%rdi);			\
 	movq	%r13, VMXCTX_HOST_R13(%rdi);			\
@@ -81,9 +84,8 @@
 	movq	%rbp, VMXCTX_HOST_RBP(%rdi);			\
 	movq	%rsp, VMXCTX_HOST_RSP(%rdi);			\
 	movq	%rbx, VMXCTX_HOST_RBX(%rdi);			\
-	movq	tmpreg, VMXCTX_HOST_RIP(%rdi)
 
-#define	VMX_HOST_RESTORE(tmpreg)				\
+#define	VMX_HOST_RESTORE					\
 	movq	VMXCTX_HOST_R15(%rdi), %r15;			\
 	movq	VMXCTX_HOST_R14(%rdi), %r14;			\
 	movq	VMXCTX_HOST_R13(%rdi), %r13;			\
@@ -91,8 +93,6 @@
 	movq	VMXCTX_HOST_RBP(%rdi), %rbp;			\
 	movq	VMXCTX_HOST_RSP(%rdi), %rsp;			\
 	movq	VMXCTX_HOST_RBX(%rdi), %rbx;			\
-	movq	VMXCTX_HOST_RIP(%rdi), tmpreg;			\
-	movq	tmpreg, (%rsp)		/* return address */
 
 /*
  * vmx_enter_guest(struct vmxctx *vmxctx, int launched)
@@ -102,10 +102,11 @@
  * Interrupts must be disabled on entry.
  */
 ENTRY(vmx_enter_guest)
+	VENTER
 	/*
 	 * Save host state before doing anything else.
	 */
-	VMX_HOST_SAVE(%r10)
+	VMX_HOST_SAVE
 
 	/*
	 * Activate guest pmap on this cpu.
	 */
@@ -186,15 +187,18 @@ inst_error:
 	movl	PCPU(CPUID), %r10d
 	LK btrl	%r10d, PM_ACTIVE(%r11)
 
-	VMX_HOST_RESTORE(%r10)
+	VMX_HOST_RESTORE
+	VLEAVE
 	ret
-END(vmx_enter_guest)
 
 /*
- * void vmx_exit_guest(void)
- * %rsp points to the struct vmxctx
+ * Non-error VM-exit from the guest. Make this a label so it can
+ * be used by C code when setting up the VMCS.
+ * The VMCS-restored %rsp points to the struct vmxctx
  */
-ENTRY(vmx_exit_guest)
+	ALIGN_TEXT
+	.globl	vmx_exit_guest
+vmx_exit_guest:
 	/*
	 * Save guest state that is not automatically saved in the vmcs.
	 */
@@ -226,15 +230,16 @@
 	movl	PCPU(CPUID), %r10d
 	LK btrl	%r10d, PM_ACTIVE(%r11)
 
-	VMX_HOST_RESTORE(%r10)
+	VMX_HOST_RESTORE
 
 	/*
	 * This will return to the caller of 'vmx_enter_guest()' with a return
	 * value of VMX_GUEST_VMEXIT.
	 */
 	movl	$VMX_GUEST_VMEXIT, %eax
+	VLEAVE
 	ret
-END(vmx_exit_guest)
+END(vmx_enter_guest)
 
 /*
  * %rdi = interrupt handler entry point
 *
@@ -243,6 +248,7 @@ END(vmx_exit_guest)
  * instruction in Intel SDM, Vol 2.
  */
 ENTRY(vmx_call_isr)
+	VENTER
 	mov	%rsp, %r11	/* save %rsp */
 	and	$~0xf, %rsp	/* align on 16-byte boundary */
 	pushq	$KERNEL_SS	/* %ss */
@@ -251,5 +257,6 @@ ENTRY(vmx_call_isr)
 	pushq	$KERNEL_CS	/* %cs */
 	cli			/* disable interrupts */
 	callq	*%rdi		/* push %rip and call isr */
+	VLEAVE
 	ret
 END(vmx_call_isr)
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 2e0a575..d93641c 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -70,7 +70,12 @@ __FBSDID("$FreeBSD$");
 #define	VLAPIC_TIMER_UNLOCK(vlapic)	mtx_unlock_spin(&((vlapic)->timer_mtx))
 #define	VLAPIC_TIMER_LOCKED(vlapic)	mtx_owned(&((vlapic)->timer_mtx))
 
-#define VLAPIC_BUS_FREQ	tsc_freq
+/*
+ * APIC timer frequency:
+ * - arbitrary but chosen to be in the ballpark of contemporary hardware.
+ * - power-of-two to avoid loss of precision when converted to a bintime.
+ */
+#define VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
 
 static __inline uint32_t
 vlapic_get_id(struct vlapic *vlapic)
diff --git a/sys/amd64/vmm/vmm_ioport.c b/sys/amd64/vmm/vmm_ioport.c
index ed17e40..eae45cc 100644
--- a/sys/amd64/vmm/vmm_ioport.c
+++ b/sys/amd64/vmm/vmm_ioport.c
@@ -69,18 +69,19 @@ emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit)
 	if (handler == NULL)
 		return (-1);
 
+	switch (vmexit->u.inout.bytes) {
+	case 1:
+		mask = 0xff;
+		break;
+	case 2:
+		mask = 0xffff;
+		break;
+	default:
+		mask = 0xffffffff;
+		break;
+	}
+
 	if (!vmexit->u.inout.in) {
-		switch (vmexit->u.inout.bytes) {
-		case 1:
-			mask = 0xff;
-			break;
-		case 2:
-			mask = 0xffff;
-			break;
-		default:
-			mask = 0xffffffff;
-			break;
-		}
 		val = vmexit->u.inout.eax & mask;
 	}
 
@@ -88,17 +89,6 @@ emulate_ioport(struct vm *vm, int vcpuid, struct vm_exit *vmexit)
 	    vmexit->u.inout.port, vmexit->u.inout.bytes, &val);
 
 	if (!error && vmexit->u.inout.in) {
-		switch (vmexit->u.inout.bytes) {
-		case 1:
-			mask = 0xff;
-			break;
-		case 2:
-			mask = 0xffff;
-			break;
-		default:
-			mask = 0xffffffff;
-			break;
-		}
 		vmexit->u.inout.eax &= ~mask;
 		vmexit->u.inout.eax |= val & mask;
 	}
diff --git a/sys/amd64/vmm/vmm_lapic.c b/sys/amd64/vmm/vmm_lapic.c
index 640c779..fa9832e 100644
--- a/sys/amd64/vmm/vmm_lapic.c
+++ b/sys/amd64/vmm/vmm_lapic.c
@@ -230,10 +230,12 @@ lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
 	off = gpa - DEFAULT_APIC_BASE;
 
 	/*
-	 * Memory mapped local apic accesses must be 4 bytes wide and
-	 * aligned on a 16-byte boundary.
+	 * Memory mapped local apic accesses should be aligned on a
+	 * 16-byte boundary.  They are also suggested to be 4 bytes
+	 * wide, alas not all OSes follow suggestions.
	 */
-	if (size != 4 || off & 0xf)
+	off &= ~3;
+	if (off & 0xf)
 		return (EINVAL);
 
 	vlapic = vm_lapic(vm, cpu);
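A closing note on the VLAPIC_BUS_FREQ hunk above: FreeBSD's bintime keeps
fractions of a second in units of 2^-64, so one tick of a timer running at
freq Hz converts to 2^64/freq of those units. The division is exact only
when freq is a power of two: 128*1024*1024 = 2^27 yields exactly 2^37,
whereas an arbitrary tsc_freq truncates and the lost remainder is re-added
on every tick. A small standalone sketch of the arithmetic (tick_frac here
approximates the kernel's FREQ2BT conversion; its exact form in sys/time.h
is an assumption, not quoted from the tree):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Per-tick bintime fraction, floor(2^64 / freq), computed in 64
	 * bits by halving the dividend and doubling the quotient.
	 */
	static uint64_t
	tick_frac(uint64_t freq)
	{
		return ((((uint64_t)1 << 63) / freq) << 1);
	}

	int
	main(void)
	{
		uint64_t pow2 = 128 * 1024 * 1024;	/* 2^27: new VLAPIC_BUS_FREQ */
		uint64_t tsc = 2400000017ULL;		/* e.g. an odd host tsc_freq */

		/* Exact: 2^37, so no error accumulates across ticks. */
		printf("2^27 Hz -> frac 0x%" PRIx64 "\n", tick_frac(pow2));
		/* Truncated: the rounding error recurs on every tick. */
		printf("tsc Hz  -> frac 0x%" PRIx64 "\n", tick_frac(tsc));
		return (0);
	}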