diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 448 |
1 files changed, 248 insertions, 200 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed53b42..d4918d6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,12 +61,37 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually small than 41 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; +module_param(ple_gap, int, S_IRUGO); + +static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, int, S_IRUGO); + struct vmcs { u32 revision_id; u32 abort; char data[0]; }; +struct shared_msr_entry { + unsigned index; + u64 data; + u64 mask; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; @@ -74,13 +99,12 @@ struct vcpu_vmx { int launched; u8 fail; u32 idt_vectoring_info; - struct kvm_msr_entry *guest_msrs; - struct kvm_msr_entry *host_msrs; + struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; - int msr_offset_efer; #ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; struct { @@ -88,7 +112,6 @@ struct vcpu_vmx { u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - int guest_efer_loaded; } host_state; struct { int vm86_active; @@ -107,7 +130,6 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; - enum emulation_result invalid_state_emulation_result; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; @@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static u64 host_efer; + static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* @@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu); */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static void load_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} - -static void save_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void) SECONDARY_EXEC_UNRESTRICTED_GUEST; } +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) int i; for (i = 0; i < vmx->nmsrs; ++i) - if (vmx->guest_msrs[i].index == msr) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) return i; return -1; } @@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) : : "a" (&operand), "c" (ext) : "cc", "memory"); } -static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -570,17 +584,12 @@ static void reload_tss(void) load_TR_desc(); } -static void load_transition_efer(struct vcpu_vmx *vmx) +static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - int efer_offset = vmx->msr_offset_efer; - u64 host_efer; u64 guest_efer; u64 ignore_bits; - if (efer_offset < 0) - return; - host_efer = vmx->host_msrs[efer_offset].data; - guest_efer = vmx->guest_msrs[efer_offset].data; + guest_efer = vmx->vcpu.arch.shadow_efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx) if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; - - vmx->host_state.guest_efer_loaded = 1; guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; - wrmsrl(MSR_EFER, guest_efer); - vmx->vcpu.stat.efer_reload++; -} - -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + return true; } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; if (vmx->host_state.loaded) return; @@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - save_msrs(vmx->host_msrs + - vmx->msr_offset_kernel_gs_base, 1); - + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } #endif - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) @@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) local_irq_restore(flags); } reload_tss(); - save_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + } +#endif } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, /* * Swap MSR entry in host/guest MSR entry array. */ -#ifdef CONFIG_X86_64 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct kvm_msr_entry tmp; + struct shared_msr_entry tmp; tmp = vmx->guest_msrs[to]; vmx->guest_msrs[to] = vmx->guest_msrs[from]; vmx->guest_msrs[from] = tmp; - tmp = vmx->host_msrs[to]; - vmx->host_msrs[to] = vmx->host_msrs[from]; - vmx->host_msrs[from] = tmp; } -#endif /* * Set up the vmcs to automatically save and restore system @@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs; + int save_nmsrs, index; unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { - int index; - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); @@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); } #endif - vmx->save_nmsrs = save_nmsrs; + index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx, index)) + move_msr_up(vmx, index, save_nmsrs++); -#ifdef CONFIG_X86_64 - vmx->msr_offset_kernel_gs_base = - __find_msr_index(vmx, MSR_KERNEL_GS_BASE); -#endif - vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + vmx->save_nmsrs = save_nmsrs; if (cpu_has_vmx_msr_bitmap()) { if (is_long_mode(&vmx->vcpu)) @@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u64 data; - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; if (!pdata) { printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); @@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_GS_BASE: data = vmcs_readl(GUEST_GS_BASE); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(to_vmx(vcpu)); + data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + break; +#endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); -#endif case MSR_IA32_TSC: data = guest_read_tsc(); break; @@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: + vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { vmx_load_host_state(to_vmx(vcpu)); @@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; u64 host_tsc; int ret = 0; @@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) +static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int old_debug = vcpu->guest_debug; - unsigned long flags; - - vcpu->guest_debug = dbg->control; - if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) - vcpu->guest_debug = 0; - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); else vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - flags = vmcs_readl(GUEST_RFLAGS); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; - else if (old_debug & KVM_GUESTDBG_SINGLESTEP) - flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - update_exception_bitmap(vcpu); - - return 0; } static __init int cpu_has_kvm_support(void) @@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } -static void hardware_enable(void *garbage) +static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old; + if (read_cr4() & X86_CR4_VMXE) + return -EBUSY; + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); if ((old & (FEATURE_CONTROL_LOCKED | @@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage) asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); + + ept_sync_global(); + + return 0; } static void vmclear_local_vcpus(void) @@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1344,15 +1337,17 @@ static void free_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } } static __init int alloc_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct vmcs *vmcs; vmcs = alloc_vmcs_cpu(cpu); @@ -1394,6 +1389,9 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); + if (!cpu_has_vmx_ple()) + ple_gap = 0; + return alloc_kvm_area(); } @@ -1536,8 +1534,16 @@ continue_rmode: static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + if (!msr) + return; + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.shadow_efer = efer; if (!msr) return; @@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmcs_write64(EPT_POINTER, eptp); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : vcpu->kvm->arch.ept_identity_map_addr; + ept_load_pdptrs(vcpu); } vmx_flush_tlb(vcpu); @@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) + if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmcs_write32(PLE_WINDOW, ple_window); + } + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (wrmsr_safe(index, data_low, data_high) < 0) continue; data = data_low | ((u64)data_high << 32); - vmx->host_msrs[j].index = index; - vmx->host_msrs[j].reserved = 0; - vmx->host_msrs[j].data = data; - vmx->guest_msrs[j] = vmx->host_msrs[j]; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = -1ull; ++vmx->nmsrs; } @@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = 0x60000010; + vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); @@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis()) + return to_vmx(vcpu)->soft_vnmi_blocked; + else + return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + GUEST_INTR_STATE_NMI); +} + +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + if (vmx->soft_vnmi_blocked != masked) { + vmx->soft_vnmi_blocked = masked; + vmx->vnmi_blocked_time = 0; + } + } else { + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && @@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) return 1; /* * Forward all other exceptions that are valid in real mode. @@ -2710,15 +2753,16 @@ static void kvm_machine_check(void) #endif } -static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_machine_check(struct kvm_vcpu *vcpu) { /* already handled by vcpu_run */ return 1; } -static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; @@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (is_machine_check(intr_info)) - return handle_machine_check(vcpu, kvm_run); + return handle_machine_check(vcpu); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) - printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __func__, vect_info, intr_info); + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; } -static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_triple_fault(struct kvm_vcpu *vcpu) { - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int size, in, string; @@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) port = exit_qualification >> 16; skip_emulated_instruction(vcpu); - return kvm_emulate_pio(vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(vcpu, in, size, port); } static void @@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; @@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; if (cr8_prev <= cr8) return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } }; @@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: break; } - kvm_run->exit_reason = 0; + vcpu->run->exit_reason = 0; pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } -static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; @@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - kvm_run->debug.arch.dr6 = vcpu->arch.dr6; - kvm_run->debug.arch.dr7 = dr; - kvm_run->debug.arch.pc = + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + vmcs_readl(GUEST_RIP); - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; @@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); return 1; } -static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data; @@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) @@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { return 1; } -static int handle_interrupt_window(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_interrupt_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (!irqchip_in_kernel(vcpu->kvm) && - kvm_run->request_interrupt_window && + vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } return 1; } -static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_halt(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); return kvm_emulate_halt(vcpu); } -static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmcall(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); kvm_emulate_hypercall(vcpu); return 1; } -static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmx_insn(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); /* TODO: Add support for VT-d/pass-through device */ return 1; } -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_apic_access(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; enum emulation_result er; @@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + er = emulate_instruction(vcpu, 0, 0, 0); if (er != EMULATE_DONE) { printk(KERN_ERR @@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; @@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; @@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; return 0; } @@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, } } -static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { u64 sptes[4]; int nr_sptes, i; @@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; return 0; } -static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; - - local_irq_enable(); - preempt_enable(); + int ret = 1; while (!guest_state_valid(vcpu)) { - err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + err = emulate_instruction(vcpu, 0, 0, 0); - if (err == EMULATE_DO_MMIO) - break; + if (err == EMULATE_DO_MMIO) { + ret = 0; + goto out; + } if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, "emulation failure"); - break; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + ret = 0; + goto out; } if (signal_pending(current)) - break; + goto out; if (need_resched()) schedule(); } - preempt_disable(); - local_irq_disable(); + vmx->emulation_required = 0; +out: + return ret; +} - vmx->invalid_state_emulation_result = err; +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); + + return 1; } /* @@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) = { +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, @@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, }; static const int kvm_vmx_max_exit_handlers = @@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers = * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); - /* If we need to emulate an MMIO from handle_invalid_guest_state - * we just return 0 */ - if (vmx->emulation_required && emulate_invalid_guest_state) { - if (guest_state_valid(vcpu)) - vmx->emulation_required = 0; - return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO; - } + /* If guest state is invalid, start emulating */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return handle_invalid_guest_state(vcpu); /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ @@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); if (unlikely(vmx->fail)) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); return 0; } @@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_reason; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; } return 0; } @@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (enable_ept && is_paging(vcpu)) { - vmcs_writel(GUEST_CR3, vcpu->arch.cr3); - ept_load_pdptrs(vcpu); - } /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); - /* Handle invalid guest state instead of entering VMX */ - if (vmx->emulation_required && emulate_invalid_guest_state) { - handle_invalid_guest_state(vcpu, kvm_run); + /* Don't enter VMX if guest state is invalid, let the exit handler + start emulation until we arrive back to a valid state */ + if (vmx->emulation_required && emulate_invalid_guest_state) return; - } if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); @@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) __clear_bit(vmx->vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); - kfree(vmx->host_msrs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit_vcpu; } - vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vmx->host_msrs) - goto free_guest_msrs; - vmx->vmcs = alloc_vmcs(); if (!vmx->vmcs) goto free_msrs; @@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) free_vmcs: free_vmcs(vmx->vmcs); free_msrs: - kfree(vmx->host_msrs); -free_guest_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); @@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .queue_exception = vmx_queue_exception, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, @@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r; + int r, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < NR_VMX_MSR; ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) @@ -4049,8 +4099,6 @@ static int __init vmx_init(void) if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - ept_sync_global(); - return 0; out3: |