From e0ead41a6dac09f86675ce07a66e4b253a9b7bd5 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Thu, 6 Jun 2013 15:32:37 +0200 Subject: KVM: async_pf: Provide additional direct page notification By setting a Kconfig option, the architecture can control when guest notifications will be presented by the apf backend. There is the default batch mechanism, working as before, where the vcpu thread should pull in this information. Opposite to this, there is now the direct mechanism, that will push the information to the guest. This way s390 can use an already existing architecture interface. Still the vcpu thread should call check_completion to cleanup leftovers. Signed-off-by: Dominik Dingel Signed-off-by: Christian Borntraeger --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e50425d..aaa60f3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3328,7 +3328,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) -- cgit v1.1 From 4f34d683e52271197e1ee17b7095e8ba27761ba6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 29 Jan 2014 17:31:38 -0200 Subject: KVM: x86: remove unused last_kernel_ns variable Remove unused last_kernel_ns variable. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39c28f09..151e8c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1581,7 +1581,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; /* -- cgit v1.1 From 5befdc385ddb2d5ae8995ad89004529a3acf58fc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 18 Feb 2014 17:22:47 +0900 Subject: KVM: Simplify kvm->tlbs_dirty handling When this was introduced, kvm_flush_remote_tlbs() could be called without holding mmu_lock. It is now acknowledged that the function must be called before releasing mmu_lock, and all callers have already been changed to do so. There is no need to use smp_mb() and cmpxchg() any more. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cba218a..b1e6c1b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't * used by guest then tlbs are not flushed, so guest is allowed to access the * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. + * We set tlbs_dirty to let the notifier know this change and delay the flush + * until such a case actually happens. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } @@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; + vcpu->kvm->tlbs_dirty = true; continue; } -- cgit v1.1 From f303b4ce8b386558b2b92aeb0c6af96685fcd4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 17 Jan 2014 20:52:42 +0100 Subject: KVM: SVM: fix NMI window after iret MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should open NMI window right after an iret, but SVM exits before it. We wanted to single step using the trap flag and then open it. (or we could emulate the iret instead) We don't do it since commit 3842d135ff2 (likely), because the iret exit handler does not request an event, so NMI window remains closed until the next exit. Fix this by making KVM_REQ_EVENT request in the iret handler. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e81df8f..64d9bb9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2842,6 +2842,7 @@ static int iret_interception(struct vcpu_svm *svm) clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); return 1; } -- cgit v1.1 From 0c79893b2bad49e0c391a9499f50fcd5b0f80874 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 21 Feb 2014 17:33:32 +0000 Subject: KVM: x86: expose new instruction RDSEED to guest From 24ffdce9efebf13c6ed4882f714b2b57ef1141eb Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Thu, 20 Feb 2014 17:38:26 +0800 Subject: [PATCH] KVM: x86: expose new instruction RDSEED to guest RDSEED instruction return a random number, which supplied by a cryptographically secure, deterministic random bit generator(DRBG). Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c697625..abe18b4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -303,7 +303,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From 49345f13f0830741b94b867cf906c4aad3988306 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 21 Feb 2014 17:36:12 +0000 Subject: KVM: x86: expose ADX feature to guest From 0750e335eb5860b0b483e217e8a08bd743cbba16 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Thu, 20 Feb 2014 17:39:32 +0800 Subject: [PATCH] KVM: x86: expose ADX feature to guest ADCX and ADOX instructions perform an unsigned addition with Carry flag and Overflow flag respectively. Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index abe18b4..a951ae4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -303,7 +303,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED) | + F(ADX); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From 56c103ec040b1944c8866f79aa768265c0dd2986 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Fri, 21 Feb 2014 17:39:02 +0000 Subject: KVM: x86: Fix xsave cpuid exposing bug From 00c920c96127d20d4c3bb790082700ae375c39a0 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Fri, 21 Feb 2014 23:47:18 +0800 Subject: [PATCH] KVM: x86: Fix xsave cpuid exposing bug EBX of cpuid(0xD, 0) is dynamic per XCR0 features enable/disable. Bit 63 of XCR0 is reserved for future expansion. Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++--- arch/x86/kvm/x86.c | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a951ae4..b241325 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv) int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= ~XSTATE_FPSSE; + xstate_bv &= XSTATE_EXTEND_MASK; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx; @@ -74,8 +74,8 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & host_xcr0 & KVM_SUPPORTED_XCR0; - vcpu->arch.guest_xstate_size = - xstate_required_size(vcpu->arch.guest_supported_xcr0); + vcpu->arch.guest_xstate_size = best->ebx = + xstate_required_size(vcpu->arch.xcr0); } kvm_pmu_cpuid_update(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 151e8c3..3da8df8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - u64 xcr0; + u64 xcr0 = xcr; + u64 old_xcr0 = vcpu->arch.xcr0; u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - xcr0 = xcr; if (!(xcr0 & XSTATE_FP)) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) @@ -618,6 +618,9 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; + + if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + kvm_update_cpuid(vcpu); return 0; } -- cgit v1.1 From da8999d31818fdc8508d527ba3aac2e128005af4 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 24 Feb 2014 10:55:46 +0000 Subject: KVM: x86: Intel MPX vmx and msr handle From caddc009a6d2019034af8f2346b2fd37a81608d0 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Mon, 24 Feb 2014 18:11:11 +0800 Subject: [PATCH v5 1/3] KVM: x86: Intel MPX vmx and msr handle This patch handle vmx and msr of Intel MPX feature. Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a06f101..e4e4b50 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -441,6 +441,7 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; + u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -1710,6 +1711,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif + if (boot_cpu_has(X86_FEATURE_MPX)) + rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -1747,6 +1750,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + if (vmx->host_state.msr_host_bndcfgs) + wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); /* * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. @@ -2837,7 +2842,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -2854,7 +2859,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; + opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -7052,6 +7057,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_mpx_supported(void) +{ + return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && + (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8634,6 +8645,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, + .mpx_supported = vmx_mpx_supported, }; static int __init vmx_init(void) @@ -8721,6 +8733,8 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, -- cgit v1.1 From 0dd376e709975779cf43f368498c5c0eec843b02 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 24 Feb 2014 10:56:53 +0000 Subject: KVM: x86: add MSR_IA32_BNDCFGS to msrs_to_save From 5d5a80cd172ea6fb51786369bcc23356b1e9e956 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Mon, 24 Feb 2014 18:11:55 +0800 Subject: [PATCH v5 2/3] KVM: x86: add MSR_IA32_BNDCFGS to msrs_to_save Add MSR_IA32_BNDCFGS to msrs_to_save, and corresponding logic to kvm_get/set_msr(). Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e4e4b50..83ee24f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2484,6 +2484,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_IA32_BNDCFGS: + data = vmcs_read64(GUEST_BNDCFGS); + break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2552,6 +2555,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; + case MSR_IA32_BNDCFGS: + vmcs_write64(GUEST_BNDCFGS, data); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3da8df8..33fa9e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -882,7 +882,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS }; static unsigned num_msrs_to_save; -- cgit v1.1 From 390bd528ae1c14d0b7f5db8225984f98617b3357 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 24 Feb 2014 10:58:09 +0000 Subject: KVM: x86: Enable Intel MPX for guest From 44c2abca2c2eadc6f2f752b66de4acc8131880c4 Mon Sep 17 00:00:00 2001 From: Liu Jinsong Date: Mon, 24 Feb 2014 18:12:31 +0800 Subject: [PATCH v5 3/3] KVM: x86: Enable Intel MPX for guest This patch enable Intel MPX feature to guest. Signed-off-by: Xudong Hao Signed-off-by: Liu Jinsong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 +++- arch/x86/kvm/x86.c | 3 +++ arch/x86/kvm/x86.h | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b241325..ddc8a7e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -256,6 +256,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported ? + (kvm_x86_ops->mpx_supported() ? F(MPX) : 0) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -303,7 +305,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX); /* all calls to cpuid_count() should be made on the same cpu */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33fa9e3..65300191 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -616,6 +616,9 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if (xcr0 & ~valid_bits) return 1; + if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8da5823..392ecbf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,7 +122,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; extern unsigned int min_timer_period_us; -- cgit v1.1 From d3714010c307d26df251c45be9cd12ab6d41f0c4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 Feb 2014 22:44:54 -0300 Subject: KVM: x86: emulator_cmpxchg_emulated should mark_page_dirty emulator_cmpxchg_emulated writes to guest memory, therefore it should update the dirty bitmap accordingly. Signed-off-by: Marcelo Tosatti Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 65300191..4cca458 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4399,6 +4399,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; -- cgit v1.1 From 684851a15744355f294ee3fee4ca2e9108382b47 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 27 Feb 2014 15:08:31 +0900 Subject: KVM: x86: Break kvm_for_each_vcpu loop after finding the VP_INDEX No need to scan the entire VCPU array. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cca458..773eba7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2328,9 +2328,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_INDEX: { int r; struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { data = r; + break; + } + } break; } case HV_X64_MSR_EOI: -- cgit v1.1 From ccf9844e5d99c1ee9a5b8c4f1332ac5211cbce03 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Feb 2014 22:54:11 +0100 Subject: kvm, vmx: Really fix lazy FPU on nested guest Commit e504c9098ed6 (kvm, vmx: Fix lazy FPU on nested guest, 2013-11-13) highlighted a real problem, but the fix was subtly wrong. nested_read_cr0 is the CR0 as read by L2, but here we want to look at the CR0 value reflecting L1's setup. In other words, L2 might think that TS=0 (so nested_read_cr0 has the bit clear); but if L1 is actually running it with TS=1, we should inject the fault into L1. The effective value of CR0 in L2 is contained in vmcs12->guest_cr0, use it. Fixes: e504c9098ed6acd9e1079c5e10e4910724ad429f Reported-by: Kashyap Chamarty Reported-by: Stefan Bader Tested-by: Kashyap Chamarty Tested-by: Anthoine Bourgeois Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 83ee24f..53c324f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6699,7 +6699,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_page_fault(intr_info)) return enable_ept; else if (is_no_device(intr_info) && - !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + !(vmcs12->guest_cr0 & X86_CR0_TS)) return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); -- cgit v1.1 From 7e44e4495a398eb553ce561f29f9148f40a3448f Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 28 Feb 2014 12:52:54 +0100 Subject: x86: kvm: rate-limit global clock updates When we update a vcpu's local clock it may pick up an NTP correction. We can't wait an indeterminate amount of time for other vcpus to pick up that correction, so commit 0061d53daf26f introduced a global clock update. However, we can't request a global clock update on every vcpu load either (which is what happens if the tsc is marked as unstable). The solution is to rate-limit the global clock updates. Marcelo calculated that we should delay the global clock updates no more than 0.1s as follows: Assume an NTP correction c is applied to one vcpu, but not the other, then in n seconds the delta of the vcpu system_timestamps will be c * n. If we assume a correction of 500ppm (worst-case), then the two vcpus will diverge 50us in 0.1s, which is a considerable amount. Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 773eba7..5ed9293 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1628,14 +1628,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * the others. * * So in those cases, request a kvmclock update for all vcpus. - * The worst case for a remote vcpu to update its kvmclock - * is then bounded by maximum nohz sleep latency. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. */ -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) + +static void kvmclock_update_fn(struct work_struct *work) { int i; - struct kvm *kvm = v->kvm; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1644,6 +1651,15 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) } } +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +{ + struct kvm *kvm = v->kvm; + + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -7022,6 +7038,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + return 0; } @@ -7059,6 +7077,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } -- cgit v1.1 From 332967a3eac06f6379283cf155c84fe7cd0537c2 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 28 Feb 2014 12:52:55 +0100 Subject: x86: kvm: introduce periodic global clock updates commit 0061d53daf26f introduced a mechanism to execute a global clock update for a vm. We can apply this periodically in order to propagate host NTP corrections. Also, if all vcpus of a vm are pinned, then without an additional trigger, no guest NTP corrections can propagate either, as the current trigger is only vcpu cpu migration. Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ed9293..1e91a24 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1660,6 +1660,20 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) KVMCLOCK_UPDATE_DELAY); } +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -6736,6 +6750,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { int r; struct msr_data msr; + struct kvm *kvm = vcpu->kvm; r = vcpu_load(vcpu); if (r) @@ -6746,6 +6761,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + return r; } @@ -7039,6 +7057,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); return 0; } @@ -7077,6 +7096,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); -- cgit v1.1 From b6b8a1451fc40412c57d10c94b62e22acab28f94 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:12 +0100 Subject: KVM: nVMX: Rework interception of IRQs and NMIs Move the check for leaving L2 on pending and intercepted IRQs or NMIs from the *_allowed handler into a dedicated callback. Invoke this callback at the relevant points before KVM checks if IRQs/NMIs can be injected. The callback has the task to switch from L2 to L1 if needed and inject the proper vmexit events. The rework fixes L2 wakeups from HLT and provides the foundation for preemption timer emulation. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 67 +++++++++++++++++++++++++++++++----------------------- arch/x86/kvm/x86.c | 26 +++++++++++++++------ 2 files changed, 57 insertions(+), 36 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 53c324f..11718b44 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_nmi(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); - /* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ - vcpu->arch.nmi_pending = 0; - vmx_set_nmi_mask(vcpu, true); - return 0; - } - } + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) { - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - if (nested_exit_on_intr(vcpu)) { - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); - /* - * fall through to normal code, but now in L1, not L2 - */ - } - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + return (!to_vmx(vcpu)->nested.nested_run_pending && + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -8172,6 +8147,35 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } } +static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + NMI_VECTOR | INTR_TYPE_NMI_INTR | + INTR_INFO_VALID_MASK, 0); + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + + if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && + nested_exit_on_intr(vcpu)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + } + + return 0; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8512,6 +8516,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; + + /* in case we halted in L2 */ + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } /* @@ -8652,6 +8659,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, + + .check_nested_events = vmx_check_nested_events, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a45bcac45..7382625 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5821,8 +5821,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) { + int r; + /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, @@ -5832,17 +5834,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.reinject); - return; + return 0; } if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); - return; + return 0; } if (vcpu->arch.interrupt.pending) { kvm_x86_ops->set_irq(vcpu); - return; + return 0; + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; } /* try to inject new event if pending */ @@ -5859,6 +5867,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) kvm_x86_ops->set_irq(vcpu); } } + return 0; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -5963,10 +5972,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); - + if (inject_pending_event(vcpu, req_int_win) != 0) + req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + else if (vcpu->arch.nmi_pending) req_immediate_exit = kvm_x86_ops->enable_nmi_window(vcpu) != 0; else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -7296,6 +7305,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) -- cgit v1.1 From f4124500c2c13eb1208c6143b3f6d469709dea10 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:13 +0100 Subject: KVM: nVMX: Fully emulate preemption timer We cannot rely on the hardware-provided preemption timer support because we are holding L2 in HLT outside non-root mode. Furthermore, emulating the preemption will resolve tick rate errata on older Intel CPUs. The emulation is based on hrtimer which is started on L2 entry, stopped on L2 exit and evaluated via the new check_nested_events hook. As we no longer rely on hardware features, we can enable both the preemption timer support and value saving unconditionally. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 151 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 96 insertions(+), 55 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 11718b44..e559675 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -110,6 +111,8 @@ module_param(nested, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -374,6 +377,9 @@ struct nested_vmx { */ struct page *apic_access_page; u64 msr_ia32_feature_control; + + struct hrtimer preemption_timer; + bool preemption_timer_expired; }; #define POSTED_INTR_ON 0 @@ -1048,6 +1054,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } +static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); @@ -2253,9 +2265,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls @@ -2270,15 +2282,10 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || - !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { - nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - } - nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2347,9 +2354,9 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | - VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT; + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + VMX_MISC_ACTIVITY_HLT; nested_vmx_misc_high = 0; } @@ -5713,6 +5720,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) +{ + struct vcpu_vmx *vmx = + container_of(timer, struct vcpu_vmx, nested.preemption_timer); + + vmx->nested.preemption_timer_expired = true; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_vcpu_kick(&vmx->vcpu); + + return HRTIMER_NORESTART; +} + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5777,6 +5796,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; + hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vmxon = true; skip_emulated_instruction(vcpu); @@ -6753,9 +6776,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. */ return 0; - case EXIT_REASON_PREEMPTION_TIMER: - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -6771,27 +6791,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) -{ - u64 delta_tsc_l1; - u32 preempt_val_l1, preempt_val_l2, preempt_scale; - - if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER)) - return; - preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & - MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; - preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); - delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) - - vcpu->arch.last_guest_tsc; - preempt_val_l1 = delta_tsc_l1 >> preempt_scale; - if (preempt_val_l2 <= preempt_val_l1) - preempt_val_l2 = 0; - else - preempt_val_l2 -= preempt_val_l1; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); -} - /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7210,8 +7209,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) - nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7608,6 +7605,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.virtual_tsc_khz == 0) + return; + + /* Make sure short timeouts reliably trigger an immediate vmexit. + * hrtimer_start does not guarantee this. */ + if (preemption_timeout <= 1) { + vmx_preemption_timer_fn(&vmx->nested.preemption_timer); + return; + } + + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + preemption_timeout *= 1000000; + do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); + hrtimer_start(&vmx->nested.preemption_timer, + ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7621,7 +7640,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; - u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7679,13 +7697,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); + exec_control = vmcs12->pin_based_vm_exec_control; + exec_control |= vmcs_config.pin_based_exec_ctrl; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, - vmcs12->vmx_preemption_timer_value); + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* * Whether page-faults are trapped is determined by a combination of @@ -7713,7 +7732,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx_secondary_exec_control(vmx); if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ @@ -7800,10 +7819,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - exit_control = vmcs_config.vmexit_ctrl; - if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) - exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; - vm_exit_controls_init(vmx, exit_control); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -8151,6 +8167,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + vmx->nested.preemption_timer_expired) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); + return 0; + } + if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { if (vmx->nested.nested_run_pending) return -EBUSY; @@ -8176,6 +8200,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + ktime_t remaining = + hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); + u64 value; + + if (ktime_to_ns(remaining) <= 0) + return 0; + + value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; + do_div(value, 1000000); + return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -8246,10 +8284,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && - (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) - vmcs12->vmx_preemption_timer_value = - vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + if (nested_cpu_has_preemption_timer(vmcs12)) { + if (vmcs12->vm_exit_controls & + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + } /* * In some cases (usually, nested EPT), L2 is allowed to change its -- cgit v1.1 From 220c56729766444f3dd823f740a147ca6d82c4c6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:14 +0100 Subject: KVM: nVMX: Do not inject NMI vmexits when L2 has a pending interrupt According to SDM 27.2.3, IDT vectoring information will not be valid on vmexits caused by external NMIs. So we have to avoid creating such scenarios by delaying EXIT_REASON_EXCEPTION_NMI injection as long as we have a pending interrupt because that one would be migrated to L1's IDT vectoring info on nested exit. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e559675..2c9d21e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8176,7 +8176,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) } if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (vmx->nested.nested_run_pending) + if (vmx->nested.nested_run_pending || + vcpu->arch.interrupt.pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | -- cgit v1.1 From c9a7953f09bbe2b66050ebf97e0532eaeefbc9f3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 7 Mar 2014 20:03:15 +0100 Subject: KVM: x86: Remove return code from enable_irq/nmi_window It's no longer possible to enter enable_irq_window in guest mode when L1 intercepts external interrupts and we are entering L2. This is now caught in vcpu_enter_guest. So we can remove the check from the VMX version of enable_irq_window, thus the need to return an error code from both enable_irq_window and enable_nmi_window. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 8 +++----- arch/x86/kvm/vmx.c | 25 +++++++------------------ arch/x86/kvm/x86.c | 6 ++---- 3 files changed, 12 insertions(+), 27 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 64d9bb9..1e8616e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3650,7 +3650,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return ret; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3664,16 +3664,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return 0; /* IRET will cause a vm exit */ + return; /* IRET will cause a vm exit */ /* * Something prevents NMI from been injected. Single step over possible @@ -3682,7 +3681,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu) svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_bp_intercept(vcpu); - return 0; } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2c9d21e..fcc1947 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4514,39 +4514,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) PIN_BASED_NMI_EXITING; } -static int enable_irq_window(struct kvm_vcpu *vcpu) +static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. The caller will have - * to make L2 exit right after entry, so we can inject to L1 - * more promptly. - */ - return -EBUSY; - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } -static int enable_nmi_window(struct kvm_vcpu *vcpu) +static void enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) - return enable_irq_window(vcpu); - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) - return enable_irq_window(vcpu); + if (!cpu_has_virtual_nmis() || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7382625..6223121 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5976,11 +5976,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) req_immediate_exit = true; /* enable NMI/IRQ window open exits if needed */ else if (vcpu->arch.nmi_pending) - req_immediate_exit = - kvm_x86_ops->enable_nmi_window(vcpu) != 0; + kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - req_immediate_exit = - kvm_x86_ops->enable_irq_window(vcpu) != 0; + kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { /* -- cgit v1.1 From c845f9c646e646e6a5fe416c2e835342984249f7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:55:44 +0100 Subject: KVM: vmx: we do rely on loading DR7 on entry Currently, this works even if the bit is not in "min", because the bit is always set in MSR_IA32_VMX_ENTRY_CTLS. Mention it for the sake of documentation, and to avoid surprises if we later switch to MSR_IA32_VMX_TRUE_ENTRY_CTLS. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fcc1947..b2a913b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2871,7 +2871,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = 0; + min = VM_ENTRY_LOAD_DEBUG_CONTROLS; opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) -- cgit v1.1 From 360b948d88bf30ef4b10b693adf497f51fb46a08 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 09:55:56 +0100 Subject: KVM: x86: change vcpu->arch.switch_db_regs to a bit mask The next patch will add another bit that we can test with the same "if". Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6223121..85c74e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -759,7 +759,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) else dr7 = vcpu->arch.dr7; kvm_x86_ops->set_dr7(vcpu, dr7); - vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; + if (dr7 & DR7_BP_EN_MASK) + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -- cgit v1.1 From c77fb5fe6f031bee9403397ae7b94ea22ea19aa7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:17:24 +0100 Subject: KVM: x86: Allow the guest to run with dirty debug registers When not running in guest-debug mode, the guest controls the debug registers and having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. After this patch, VMX- and SVM-specific code can set a flag in switch_db_regs, telling vcpu_enter_guest that on the next exit the debug registers might be dirty and need to be reloaded (syncing will be taken care of by a new callback in kvm_x86_ops). This flag can be set on the first access to a debug registers, so that multiple accesses to the debug registers only cause one vmexit. Note that since the guest will be able to read debug registers and enable breakpoints in DR7, we need to ensure that they are synchronized on entry to the guest---including DR6 that was not synced before. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 85c74e7..d906391 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6040,12 +6040,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + set_debugreg(vcpu->arch.dr6, 6); } trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. + */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + kvm_x86_ops->sync_dirty_debug_regs(vcpu); + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't -- cgit v1.1 From 81908bf44340eb5ebc9969f67e6c8be0c92f2857 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:32:27 +0100 Subject: KVM: vmx: Allow the guest to run with dirty debug registers When not running in guest-debug mode (i.e. the guest controls the debug registers, having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. If the guest is running with vcpu->arch.db == vcpu->arch.eff_db, we can let it write freely to the debug registers and reload them on the next exit. We still need to exit on the first access, so that the KVM_DEBUGREG_WONT_EXIT flag is set in switch_db_regs; after that, further accesses to the debug registers will not cause a vmexit. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b2a913b..a9940ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include "trace.h" @@ -2850,7 +2851,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif @@ -5084,6 +5085,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu->guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); @@ -5110,6 +5127,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + get_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -8628,6 +8663,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, -- cgit v1.1 From d16c293e4ecbddedfc1d64095ce56f0569adc12b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:36:37 +0100 Subject: KVM: nVMX: Allow nested guests to run with dirty debug registers When preparing the VMCS02, the CPU-based execution controls is computed by vmx_exec_control. Turn off DR access exits there, too, if the KVM_DEBUGREG_WONT_EXIT bit is set in switch_db_regs. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a9940ec..f4e5aed 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4242,6 +4242,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) + exec_control &= ~CPU_BASED_MOV_DR_EXITING; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 -- cgit v1.1 From 5315c716b69f47e1751d09e16c7bd5b559419531 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 3 Mar 2014 13:08:29 +0100 Subject: KVM: svm: set/clear all DR intercepts in one swoop Unlike other intercepts, debug register intercepts will be modified in hot paths if the guest OS is bad or otherwise gets tricked into doing so. Avoid calling recalc_intercepts 16 times for debug registers. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1e8616e..86d802b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -303,20 +303,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) return vmcb->control.intercept_cr & (1U << bit); } -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr |= (1U << bit); + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr &= ~(1U << bit); + vmcb->control.intercept_dr = 0; recalc_intercepts(svm); } @@ -1080,23 +1095,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_cr_intercept(svm, INTERCEPT_CR4_WRITE); set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); + set_dr_intercepts(svm); set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); -- cgit v1.1 From facb0139698923dc7b7d15cafbb319219969f4fd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 21 Feb 2014 10:32:27 +0100 Subject: KVM: svm: Allow the guest to run with dirty debug registers When not running in guest-debug mode (i.e. the guest controls the debug registers, having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. If the guest is running with vcpu->arch.db == vcpu->arch.eff_db, we can let it write freely to the debug registers and reload them on the next exit. We still need to exit on the first access, so that the KVM_DEBUGREG_WONT_EXIT flag is set in switch_db_regs; after that, further accesses to the debug registers will not cause a vmexit. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 86d802b..a449c3d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1683,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) mark_dirty(svm->vmcb, VMCB_DR); } +static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + get_debugreg(vcpu->arch.db[0], 0); + get_debugreg(vcpu->arch.db[1], 1); + get_debugreg(vcpu->arch.db[2], 2); + get_debugreg(vcpu->arch.db[3], 3); + vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr7 = svm->vmcb->save.dr7; + + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; + set_dr_intercepts(svm); +} + static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm) unsigned long val; int err; + if (svm->vcpu.guest_debug == 0) { + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + clr_dr_intercepts(svm); + svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) return emulate_on_interception(svm); @@ -4300,6 +4327,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_dr6 = svm_get_dr6, .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, + .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, -- cgit v1.1 From 100943c54e0947a07d2c0185368fc2fd848f7f28 Mon Sep 17 00:00:00 2001 From: "Gabriel L. Somlo" Date: Thu, 27 Feb 2014 23:06:17 -0500 Subject: kvm: x86: ignore ioapic polarity Both QEMU and KVM have already accumulated a significant number of optimizations based on the hard-coded assumption that ioapic polarity will always use the ActiveHigh convention, where the logical and physical states of level-triggered irq lines always match (i.e., active(asserted) == high == 1, inactive == low == 0). QEMU guests are expected to follow directions given via ACPI and configure the ioapic with polarity 0 (ActiveHigh). However, even when misbehaving guests (e.g. OS X <= 10.9) set the ioapic polarity to 1 (ActiveLow), QEMU will still use the ActiveHigh signaling convention when interfacing with KVM. This patch modifies KVM to completely ignore ioapic polarity as set by the guest OS, enabling misbehaving guests to work alongside those which comply with the ActiveHigh polarity specified by QEMU's ACPI tables. Signed-off-by: Michael S. Tsirkin Signed-off-by: Gabriel L. Somlo [Move documentation to KVM_IRQ_LINE, add ia64. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d906391..a37da6b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2657,6 +2657,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: + case KVM_CAP_IOAPIC_POLARITY_IGNORED: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: -- cgit v1.1 From 27ce825823a145eb72bd5a5832c6dbb3168b720e Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Sat, 15 Mar 2014 21:01:59 +0100 Subject: KVM: x86 emulator: emulate MOVAPS HCK memory driver test fails when testing 32-bit Windows 8.1 with baloon driver. tracing KVM shows error: reason EXIT_ERR rip 0x81c18326 info 0 0 x/10i 0x81c18326-20 0x0000000081c18312: add %al,(%eax) 0x0000000081c18314: add %cl,-0x7127711d(%esi) 0x0000000081c1831a: rolb $0x0,0x80ec(%ecx) 0x0000000081c18321: and $0xfffffff0,%esp 0x0000000081c18324: mov %esp,%esi 0x0000000081c18326: movaps %xmm0,(%esi) 0x0000000081c18329: movaps %xmm1,0x10(%esi) 0x0000000081c1832d: movaps %xmm2,0x20(%esi) 0x0000000081c18331: movaps %xmm3,0x30(%esi) 0x0000000081c18335: movaps %xmm4,0x40(%esi) which points to MOVAPS instruction currently no emulated by KVM. Fix it by adding appropriate entries to opcode table in KVM's emulator. Signed-off-by: Igor Mammedov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 07ffca0..a26d075 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct gprefix pfx_0f_28_0f_29 = { + I(Aligned, em_mov), N, N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), -- cgit v1.1 From 6fec27d80feb12f88babcfe75f70f955c51723e8 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Sat, 15 Mar 2014 21:02:00 +0100 Subject: KVM: x86 emulator: emulate MOVAPD Add emulation for 0x66 prefixed instruction of 0f 28 opcode that has been added earlier. Signed-off-by: Igor Mammedov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a26d075..205b17e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3669,7 +3669,7 @@ static const struct gprefix pfx_vmovntpx = { }; static const struct gprefix pfx_0f_28_0f_29 = { - I(Aligned, em_mov), N, N, N, + I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; static const struct escape escape_d9 = { { -- cgit v1.1 From 4ff417320c2dfc984ec1939a7da888976441a881 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Feb 2014 12:15:16 +0100 Subject: KVM: x86: introduce kvm_supported_xcr0() XSAVE support for KVM is already using host_xcr0 & KVM_SUPPORTED_XCR0 as a "dynamic" version of KVM_SUPPORTED_XCR0. However, this is not enough because the MPX bits should not be presented to the guest unless kvm_x86_ops confirms the support. So, replace all instances of host_xcr0 & KVM_SUPPORTED_XCR0 with a new function kvm_supported_xcr0() that also has this check. Note that here: if (xstate_bv & ~KVM_SUPPORTED_XCR0) return -EINVAL; if (xstate_bv & ~host_cr0) return -EINVAL; the code is equivalent to if ((xstate_bv & ~KVM_SUPPORTED_XCR0) || (xstate_bv & ~host_cr0) return -EINVAL; i.e. "xstate_bv & (~KVM_SUPPORTED_XCR0 | ~host_cr0)" which is in turn equal to "xstate_bv & ~(KVM_SUPPORTED_XCR0 & host_cr0)". So we should also use the new function there. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 27 ++++++++++++++++----------- arch/x86/kvm/x86.c | 4 +--- arch/x86/kvm/x86.h | 2 ++ 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ddc8a7e..18aefb4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv) return ret; } +u64 kvm_supported_xcr0(void) +{ + u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; + + if (!kvm_x86_ops->mpx_supported || !kvm_x86_ops->mpx_supported()) + xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + + return xcr0; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -73,7 +83,7 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & - host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_supported_xcr0(); vcpu->arch.guest_xstate_size = best->ebx = xstate_required_size(vcpu->arch.xcr0); } @@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & KVM_SUPPORTED_XCR0 & host_xcr0; -} - #define F(x) bit(X86_FEATURE_##x) static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, @@ -439,16 +442,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 0xd: { int idx, i; + u64 supported = kvm_supported_xcr0(); - entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; - entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; + entry->eax &= supported; + entry->edx &= supported >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { + u64 mask = ((u64)1 << idx); if (*nent >= maxnent) goto out; do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + if (entry[i].eax == 0 || !(supported & mask)) continue; entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a37da6b..3f5fb45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3084,9 +3084,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~KVM_SUPPORTED_XCR0) - return -EINVAL; - if (xstate_bv & ~host_xcr0) + if (xstate_bv & ~kvm_supported_xcr0()) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, guest_xsave->region, vcpu->arch.guest_xstate_size); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 392ecbf..8c97bac 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -126,6 +126,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; +extern u64 kvm_supported_xcr0(void); + extern unsigned int min_timer_period_us; extern struct static_key kvm_no_apic_vcpu; -- cgit v1.1 From 36be0b9deb23161e9eba962c215aece551113a15 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Feb 2014 12:30:04 +0100 Subject: KVM: x86: Add nested virtualization support for MPX This is simple to do, the "host" BNDCFGS is either 0 or the guest value. However, both controls have to be present. We cannot provide MPX if we only have one of the "load BNDCFGS" or "clear BNDCFGS" controls. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f4e5aed..c95bea1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -206,6 +206,7 @@ struct __packed vmcs12 { u64 guest_pdptr1; u64 guest_pdptr2; u64 guest_pdptr3; + u64 guest_bndcfgs; u64 host_ia32_pat; u64 host_ia32_efer; u64 host_ia32_perf_global_ctrl; @@ -541,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, + GUEST_BNDCFGS, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, @@ -596,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(GUEST_PDPTR1, guest_pdptr1), FIELD64(GUEST_PDPTR2, guest_pdptr2), FIELD64(GUEST_PDPTR3, guest_pdptr3), + FIELD64(GUEST_BNDCFGS, guest_bndcfgs), FIELD64(HOST_IA32_PAT, host_ia32_pat), FIELD64(HOST_IA32_EFER, host_ia32_efer), FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), @@ -736,6 +739,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2287,6 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + if (vmx_mpx_supported()) + nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2300,6 +2306,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) VM_ENTRY_LOAD_IA32_PAT; nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + if (vmx_mpx_supported()) + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -7866,6 +7874,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); @@ -8351,6 +8362,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + if (vmx_mpx_supported()) + vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* update exit information fields: */ @@ -8460,6 +8473,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ + if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) + vmcs_write64(GUEST_BNDCFGS, 0); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; -- cgit v1.1 From 93c4adc7afedf9b0ec190066d45b6d67db5270da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Mar 2014 23:19:52 +0100 Subject: KVM: x86: handle missing MPX in nested virtualization When doing nested virtualization, we may be able to read BNDCFGS but still not be allowed to write to GUEST_BNDCFGS in the VMCS. Guard writes to the field with vmx_mpx_supported(), and similarly hide the MSR from userspace if the processor does not support the field. We could work around this with the generic MSR save/load machinery, but there is only a limited number of MSR save/load slots and it is not really worthwhile to waste one for a scenario that should not happen except in the nested virtualization case. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 5 ++--- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 5 +++++ arch/x86/kvm/x86.c | 17 +++++++++++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 18aefb4..64fae65 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -47,7 +47,7 @@ u64 kvm_supported_xcr0(void) { u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; - if (!kvm_x86_ops->mpx_supported || !kvm_x86_ops->mpx_supported()) + if (!kvm_x86_ops->mpx_supported()) xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); return xcr0; @@ -259,8 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_x86_ops->mpx_supported ? - (kvm_x86_ops->mpx_supported() ? F(MPX) : 0) : 0; + unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a449c3d..2136cb6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4089,6 +4089,11 @@ static bool svm_invpcid_supported(void) return false; } +static bool svm_mpx_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4371,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .invpcid_supported = svm_invpcid_supported, + .mpx_supported = svm_mpx_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c95bea1..1320e0f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -729,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static bool vmx_mpx_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -2501,6 +2502,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: @@ -2572,6 +2575,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: + if (!vmx_mpx_supported()) + return 1; vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_TSC: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f5fb45..aa98695 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3937,6 +3937,23 @@ static void kvm_init_msr_list(void) for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; + + /* + * Even MSRs that are valid in the host may not be exposed + * to the guests in some cases. We could work around this + * in VMX with the generic MSR save/load machinery, but it + * is not really worthwhile since it will really only + * happen with nested virtualization. + */ + switch (msrs_to_save[i]) { + case MSR_IA32_BNDCFGS: + if (!kvm_x86_ops->mpx_supported()) + continue; + break; + default: + break; + } + if (j < i) msrs_to_save[j] = msrs_to_save[i]; j++; -- cgit v1.1 From 920c837785699bcc48f4a729ba9ee3492f620b95 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Mar 2014 15:54:00 +0100 Subject: KVM: vmx: fix MPX detection kvm_x86_ops is still NULL at this point. Since kvm_init_msr_list cannot fail, it is safe to initialize it before the call. Fixes: 93c4adc7afedf9b0ec190066d45b6d67db5270da Reported-by: Fengguang Wu Tested-by: Jet Chen Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aa98695..d1c55f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5594,9 +5594,10 @@ int kvm_arch_init(void *opaque) goto out_free_percpu; kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); kvm_x86_ops = ops; + kvm_init_msr_list(); + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); -- cgit v1.1