From 908a7bdd6adba3dfd35d8a74a48aed90593de178 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 18 Feb 2013 11:21:16 +0100
Subject: KVM: nVMX: Improve I/O exit handling

This prevents trapping L2 I/O exits if L1 has neither unconditional nor
bitmap-based exiting enabled. Furthermore, it implements I/O bitmap
handling.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6667042..b4ce43c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5908,6 +5908,52 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	unsigned long exit_qualification;
+	gpa_t bitmap, last_bitmap;
+	unsigned int port;
+	int size;
+	u8 b;
+
+	if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
+		return 1;
+
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+		return 0;
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	port = exit_qualification >> 16;
+	size = (exit_qualification & 7) + 1;
+
+	last_bitmap = (gpa_t)-1;
+	b = -1;
+
+	while (size > 0) {
+		if (port < 0x8000)
+			bitmap = vmcs12->io_bitmap_a;
+		else if (port < 0x10000)
+			bitmap = vmcs12->io_bitmap_b;
+		else
+			return 1;
+		bitmap += (port & 0x7fff) / 8;
+
+		if (last_bitmap != bitmap)
+			if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+				return 1;
+		if (b & (1 << (port & 7)))
+			return 1;
+
+		port++;
+		size--;
+		last_bitmap = bitmap;
+	}
+
+	return 0;
+}
+
 /*
  * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
@@ -6097,8 +6143,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_DR_ACCESS:
 		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
 	case EXIT_REASON_IO_INSTRUCTION:
-		/* TODO: support IO bitmaps */
-		return 1;
+		return nested_vmx_exit_handled_io(vcpu, vmcs12);
 	case EXIT_REASON_MSR_READ:
 	case EXIT_REASON_MSR_WRITE:
 		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
-- 
cgit v1.1


From bd31a7f55777a46063ebe3d50e6536cfff51a757 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 14 Feb 2013 19:46:27 +0100
Subject: KVM: nVMX: Trap unconditionally if msr bitmap access fails

This avoids basing decisions on uninitialized variables, potentially
leaking kernel data to the L1 guest.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b4ce43c..c1b3041 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5985,7 +5985,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 	/* Then read the msr_index'th bit from this bitmap: */
 	if (msr_index < 1024*8) {
 		unsigned char b;
-		kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
+		if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+			return 1;
 		return 1 & (b >> (msr_index & 7));
 	} else
 		return 1; /* let L1 handle the wrong parameter */
-- 
cgit v1.1


From 733568f9cecc061eca213ba0e877a1f820a40de5 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 23 Feb 2013 15:07:47 +0100
Subject: KVM: VMX: Make prepare_vmcs12 and load_vmcs12_host_state static

Both are only used locally.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c1b3041..8a99a62 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7330,7 +7330,7 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
  * which already writes to vmcs12 directly.
  */
-void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
 	/* update guest state fields: */
 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -7421,7 +7421,8 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * Failures During or After Loading Guest State").
  * This function should be called when the active VMCS is L1's (vmcs01).
  */
-void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+				   struct vmcs12 *vmcs12)
 {
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
-- 
cgit v1.1


From d6851fbeee3f79ac2629f823e15ac2a7f6f54e0e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 23 Feb 2013 22:34:39 +0100
Subject: KVM: nVMX: Advertise PAUSE and WBINVD exiting support

These exits have no preconditions, and we already process the
corresponding reasons in nested_vmx_exit_handled correctly.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8a99a62..329836a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2080,6 +2080,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
 		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
 		CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
+		CPU_BASED_PAUSE_EXITING |
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 	/*
 	 * We can allow some features even when not supported by the
@@ -2094,7 +2095,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 		nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
 	nested_vmx_secondary_ctls_low = 0;
 	nested_vmx_secondary_ctls_high &=
-		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+		SECONDARY_EXEC_WBINVD_EXITING;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
-- 
cgit v1.1


From 36c3cc422b7c5d3cd84cbac769758b197e08f221 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 23 Feb 2013 22:35:37 +0100
Subject: KVM: nVMX: Clear segment cache after switching between L1 and L2

Switching the VMCS obviously invalidates what may have been cached about
the guest segments.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 329836a..3a58c1b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7271,6 +7271,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	vcpu->cpu = cpu;
 	put_cpu();
 
+	vmx_segment_cache_clear(vmx);
+
 	vmcs12->launch_state = 1;
 
 	prepare_vmcs02(vcpu, vmcs12);
@@ -7517,6 +7519,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	vcpu->cpu = cpu;
 	put_cpu();
 
+	vmx_segment_cache_clear(vmx);
+
 	/* if no vmcs02 cache requested, remove the one we used */
 	if (VMCS02_POOL_SIZE == 0)
 		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-- 
cgit v1.1


From 957c897e8cf5a26abbce1a1a38833251339d596d Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 24 Feb 2013 14:11:34 +0100
Subject: KVM: nVMX: Use cached exit reason

No need to re-read what vmx_vcpu_run already picked up for us.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3a58c1b..f7d2242 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6082,10 +6082,10 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  */
 static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 {
-	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
 	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 exit_reason = vmx->exit_reason;
 
 	if (vmx->nested.nested_run_pending)
 		return 0;
@@ -7399,7 +7399,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	/* update exit information fields: */
 
-	vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->vm_exit_reason  = to_vmx(vcpu)->exit_reason;
 	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-- 
cgit v1.1


From 02cdb50fd7e4b5ce1f6f70e27f74283ced0e1872 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Wed, 27 Feb 2013 11:33:25 +0800
Subject: arch/x86/kvm: beautify source code for __u32 irq which is never < 0

  irp->irq is __u32 which is never < 0.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c5bb6f..d0cf737 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2696,7 +2696,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 				    struct kvm_interrupt *irq)
 {
-	if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
+	if (irq->irq >= KVM_NR_INTERRUPTS)
 		return -EINVAL;
 	if (irqchip_in_kernel(vcpu->kvm))
 		return -ENXIO;
-- 
cgit v1.1


From 44ceb9d6653306f73fb40cbeca303b23937efd85 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 20 Feb 2013 14:02:23 +0100
Subject: KVM: nVMX: Avoid one redundant vmcs_read in prepare_vmcs12

IDT_VECTORING_INFO_FIELD was already read right after vmexit.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7d2242..238c59b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7404,8 +7404,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-	vmcs12->idt_vectoring_info_field =
-		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_info_field = to_vmx(vcpu)->idt_vectoring_info;
 	vmcs12->idt_vectoring_error_code =
 		vmcs_read32(IDT_VECTORING_ERROR_CODE);
 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-- 
cgit v1.1


From 3ab66e8a455a4877889c65a848f2fb32be502f2c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 20 Feb 2013 14:03:24 +0100
Subject: KVM: VMX: Pass vcpu to __vmx_complete_interrupts

Cleanup: __vmx_complete_interrupts has no use for the vmx structure.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 238c59b..7cc566b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6436,7 +6436,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 }
 
-static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
 				      u32 idt_vectoring_info,
 				      int instr_len_field,
 				      int error_code_field)
@@ -6447,46 +6447,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-	vmx->vcpu.arch.nmi_injected = false;
-	kvm_clear_exception_queue(&vmx->vcpu);
-	kvm_clear_interrupt_queue(&vmx->vcpu);
+	vcpu->arch.nmi_injected = false;
+	kvm_clear_exception_queue(vcpu);
+	kvm_clear_interrupt_queue(vcpu);
 
 	if (!idtv_info_valid)
 		return;
 
-	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
 	switch (type) {
 	case INTR_TYPE_NMI_INTR:
-		vmx->vcpu.arch.nmi_injected = true;
+		vcpu->arch.nmi_injected = true;
 		/*
 		 * SDM 3: 27.7.1.2 (September 2008)
 		 * Clear bit "block by NMI" before VM entry if a NMI
 		 * delivery faulted.
 		 */
-		vmx_set_nmi_mask(&vmx->vcpu, false);
+		vmx_set_nmi_mask(vcpu, false);
 		break;
 	case INTR_TYPE_SOFT_EXCEPTION:
-		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(instr_len_field);
+		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
 		/* fall through */
 	case INTR_TYPE_HARD_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
 			u32 err = vmcs_read32(error_code_field);
-			kvm_queue_exception_e(&vmx->vcpu, vector, err);
+			kvm_queue_exception_e(vcpu, vector, err);
 		} else
-			kvm_queue_exception(&vmx->vcpu, vector);
+			kvm_queue_exception(vcpu, vector);
 		break;
 	case INTR_TYPE_SOFT_INTR:
-		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(instr_len_field);
+		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
 		/* fall through */
 	case INTR_TYPE_EXT_INTR:
-		kvm_queue_interrupt(&vmx->vcpu, vector,
-			type == INTR_TYPE_SOFT_INTR);
+		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
 		break;
 	default:
 		break;
@@ -6497,7 +6494,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	if (is_guest_mode(&vmx->vcpu))
 		return;
-	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
 				  VM_EXIT_INSTRUCTION_LEN,
 				  IDT_VECTORING_ERROR_CODE);
 }
@@ -6506,7 +6503,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu))
 		return;
-	__vmx_complete_interrupts(to_vmx(vcpu),
+	__vmx_complete_interrupts(vcpu,
 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 				  VM_ENTRY_INSTRUCTION_LEN,
 				  VM_ENTRY_EXCEPTION_ERROR_CODE);
-- 
cgit v1.1


From 462fce46065ec4b200c08619c047b9e5a8fd154a Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 27 Feb 2013 19:41:56 +0900
Subject: KVM: set_memory_region: Drop user_alloc from
 prepare/commit_memory_region()

X86 does not use this any more.  The remaining user, s390's !user_alloc
check, can be simply removed since KVM_SET_MEMORY_REGION ioctl is no
longer supported.

Note: fixed powerpc's indentations with spaces to suppress checkpatch
errors.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 811c5c9..26216bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6907,8 +6907,7 @@ out_free:
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
-				struct kvm_userspace_memory_region *mem,
-				bool user_alloc)
+				struct kvm_userspace_memory_region *mem)
 {
 	int npages = memslot->npages;
 
@@ -6938,8 +6937,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old,
-				bool user_alloc)
+				struct kvm_memory_slot old)
 {
 
 	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
-- 
cgit v1.1


From 47ae31e257c548abdb199e0d26723139a9a967ba Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 27 Feb 2013 19:43:00 +0900
Subject: KVM: set_memory_region: Drop user_alloc from set_memory_region()

Except ia64's stale code, KVM_SET_MEMORY_REGION support, this is only
used for sanity checks in __kvm_set_memory_region() which can easily
be changed to use slot id instead.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7cc566b..58fb7c2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3694,7 +3694,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	kvm_userspace_mem.flags = 0;
 	kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
 	if (r)
 		goto out;
 
@@ -3724,7 +3724,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	kvm_userspace_mem.guest_phys_addr =
 		kvm->arch.ept_identity_map_addr;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
+	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
 	if (r)
 		goto out;
 
@@ -4364,7 +4364,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 		.flags = 0,
 	};
 
-	ret = kvm_set_memory_region(kvm, &tss_mem, false);
+	ret = kvm_set_memory_region(kvm, &tss_mem);
 	if (ret)
 		return ret;
 	kvm->arch.tss_addr = addr;
-- 
cgit v1.1


From 7b6195a91d60909a2834ab7181e2b9476e6fe749 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 27 Feb 2013 19:44:34 +0900
Subject: KVM: set_memory_region: Refactor prepare_memory_region()

This patch drops the parameter old, a copy of the old memory slot, and
adds a new parameter named change to know the change being requested.

This not only cleans up the code but also removes extra copying of the
memory slot structure.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 26216bb..7198234 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6906,23 +6906,21 @@ out_free:
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_memory_slot old,
-				struct kvm_userspace_memory_region *mem)
+				struct kvm_userspace_memory_region *mem,
+				enum kvm_mr_change change)
 {
-	int npages = memslot->npages;
-
 	/*
 	 * Only private memory slots need to be mapped here since
 	 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
 	 */
-	if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+	if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
 		unsigned long userspace_addr;
 
 		/*
 		 * MAP_SHARED to prevent internal slot pages from being moved
 		 * by fork()/COW.
 		 */
-		userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+		userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
 					 PROT_READ | PROT_WRITE,
 					 MAP_SHARED | MAP_ANONYMOUS, 0);
 
-- 
cgit v1.1


From 8482644aea11e0647867732319ccf35879a9acc2 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 27 Feb 2013 19:45:25 +0900
Subject: KVM: set_memory_region: Refactor commit_memory_region()

This patch makes the parameter old a const pointer to the old memory
slot and adds a new parameter named change to know the change being
requested: the former is for removing extra copying and the latter is
for cleaning up the code.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7198234..35b4912 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6935,16 +6935,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
-				struct kvm_memory_slot old)
+				const struct kvm_memory_slot *old,
+				enum kvm_mr_change change)
 {
 
-	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
+	int nr_mmu_pages = 0;
 
-	if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
+	if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
 		int ret;
 
-		ret = vm_munmap(old.userspace_addr,
-				old.npages * PAGE_SIZE);
+		ret = vm_munmap(old->userspace_addr,
+				old->npages * PAGE_SIZE);
 		if (ret < 0)
 			printk(KERN_WARNING
 			       "kvm_vm_ioctl_set_memory_region: "
@@ -6961,13 +6962,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * Existing largepage mappings are destroyed here and new ones will
 	 * not be created until the end of the logging.
 	 */
-	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
 	/*
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
 	 */
-	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
 		kvm_mmu_zap_all(kvm);
 		kvm_reload_remote_mmus(kvm);
 	}
-- 
cgit v1.1


From 503cd0c50ac7161eb5c3891b48b620cb0a5521cd Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 3 Mar 2013 13:05:44 +0100
Subject: KVM: nVMX: Fix switching of debug state

First of all, do not blindly overwrite GUEST_DR7 on L2 entry. The host
may have guest debugging enabled. Then properly reset DR7 and DEBUG_CTL
on L2->L1 switch as specified in the SDM.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 58fb7c2..097f5d6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6978,7 +6978,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs12->guest_interruptibility_info);
 	vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-	vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+	kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
 	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
 	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
 		vmcs12->guest_pending_dbg_exceptions);
@@ -7492,6 +7492,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
 		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
 			vmcs12->host_ia32_perf_global_ctrl);
+
+	kvm_set_dr(vcpu, 7, 0x400);
+	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 }
 
 /*
-- 
cgit v1.1


From c4627c72e9c9e0fc35af2e9d612888fe4564377d Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 3 Mar 2013 20:47:11 +0100
Subject: KVM: nVMX: Reset RFLAGS on VM-exit

Ouch, how could this work so well that far? We need to clear RFLAGS to
the reset value as specified by the SDM. Particularly, IF must be off
after VM-exit!

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 097f5d6..aacf6a4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7434,6 +7434,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+	vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
 	/*
 	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
 	 * actually changed, because it depends on the current state of
-- 
cgit v1.1


From 33fb20c39e98b90813b5ab2d9a0d6faa6300caca Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 6 Mar 2013 15:44:03 +0100
Subject: KVM: nVMX: Fix content of MSR_IA32_VMX_ENTRY/EXIT_CTLS

Properly set those bits to 1 that the spec demands in case bit 55 of
VMX_BASIC is 0 - like in our case.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aacf6a4..a9d8853 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2049,21 +2049,28 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 		PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
 		PIN_BASED_VIRTUAL_NMIS;
 
-	/* exit controls */
-	nested_vmx_exit_ctls_low = 0;
+	/*
+	 * Exit controls
+	 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
+	 * 17 must be 1.
+	 */
+	nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
 #ifdef CONFIG_X86_64
 	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #else
 	nested_vmx_exit_ctls_high = 0;
 #endif
+	nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 
 	/* entry controls */
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
-	nested_vmx_entry_ctls_low = 0;
+	/* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
+	nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_entry_ctls_high &=
 		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+	nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 
 	/* cpu-based controls */
 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
-- 
cgit v1.1


From 1a0d74e66405a795bb37a4a23ece50f8d8e5e81e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 7 Mar 2013 14:08:07 +0100
Subject: KVM: nVMX: Fix setting of CR0 and CR4 in guest mode

The logic for calculating the value with which we call kvm_set_cr0/4 was
broken (will definitely be visible with nested unrestricted guest mode
support). Also, we performed the check regarding CR0_ALWAYSON too early
when in guest mode.

What really needs to be done on both CR0 and CR4 is to mask out L1-owned
bits and merge them in from L1's guest_cr0/4. In contrast, arch.cr0/4
and arch.cr0/4_guest_owned_bits contain the mangled L0+L1 state and,
thus, are not suited as input.

For both CRs, we can then apply the check against VMXON_CRx_ALWAYSON and
refuse the update if it fails. To be fully consistent, we implement this
check now also for CR4. For CR4, we move the check into vmx_set_cr4
while we keep it in handle_set_cr0. This is because the CR0 checks for
vmxon vs. guest mode will diverge soon when adding unrestricted guest
mode support.

Finally, we have to set the shadow to the value L2 wanted to write
originally.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a9d8853..260da9a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3223,7 +3223,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		 */
 		if (!nested_vmx_allowed(vcpu))
 			return 1;
-	} else if (to_vmx(vcpu)->nested.vmxon)
+	}
+	if (to_vmx(vcpu)->nested.vmxon &&
+	    ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
 		return 1;
 
 	vcpu->arch.cr4 = cr4;
@@ -4612,34 +4614,50 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
-	if (to_vmx(vcpu)->nested.vmxon &&
-	    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
-		return 1;
-
 	if (is_guest_mode(vcpu)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		unsigned long orig_val = val;
+
 		/*
 		 * We get here when L2 changed cr0 in a way that did not change
 		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
-		 * but did change L0 shadowed bits. This can currently happen
-		 * with the TS bit: L0 may want to leave TS on (for lazy fpu
-		 * loading) while pretending to allow the guest to change it.
+		 * but did change L0 shadowed bits. So we first calculate the
+		 * effective cr0 value that L1 would like to write into the
+		 * hardware. It consists of the L2-owned bits from the new
+		 * value combined with the L1-owned bits from L1's guest_cr0.
 		 */
-		if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
-			 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+		val = (val & ~vmcs12->cr0_guest_host_mask) |
+			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+		/* TODO: will have to take unrestricted guest mode into
+		 * account */
+		if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
 			return 1;
-		vmcs_writel(CR0_READ_SHADOW, val);
+
+		if (kvm_set_cr0(vcpu, val))
+			return 1;
+		vmcs_writel(CR0_READ_SHADOW, orig_val);
 		return 0;
-	} else
+	} else {
+		if (to_vmx(vcpu)->nested.vmxon &&
+		    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+			return 1;
 		return kvm_set_cr0(vcpu, val);
+	}
 }
 
 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	if (is_guest_mode(vcpu)) {
-		if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
-			 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		unsigned long orig_val = val;
+
+		/* analogously to handle_set_cr0 */
+		val = (val & ~vmcs12->cr4_guest_host_mask) |
+			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+		if (kvm_set_cr4(vcpu, val))
 			return 1;
-		vmcs_writel(CR4_READ_SHADOW, val);
+		vmcs_writel(CR4_READ_SHADOW, orig_val);
 		return 0;
 	} else
 		return kvm_set_cr4(vcpu, val);
-- 
cgit v1.1


From 1044b0303464788820984a5b01c0a81860dce749 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 6 Mar 2013 16:05:07 +0900
Subject: KVM: MMU: Fix and clean up for_each_gfn_* macros

The expression (sp)->gfn should not be expanded using @gfn.
Although no user of these macros passes a string other than gfn now,
this should be fixed before anyone sees strange errors.

Note: ignored the following checkpatch errors:
  ERROR: Macros with complex values should be enclosed in parenthesis
  ERROR: trailing statements should be on next line

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..3e4822b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1644,16 +1644,14 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
-#define for_each_gfn_sp(kvm, sp, gfn)					\
-  hlist_for_each_entry(sp,						\
-   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
-	if ((sp)->gfn != (gfn)) {} else
-
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn)			\
-  hlist_for_each_entry(sp,						\
-   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
-		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
-			(sp)->role.invalid) {} else
+#define for_each_gfn_sp(_kvm, _sp, _gfn)				\
+	hlist_for_each_entry(_sp,					\
+	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+		if ((_sp)->gfn != (_gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
+	for_each_gfn_sp(_kvm, _sp, _gfn)				\
+		if ((_sp)->role.direct || (_sp)->role.invalid) {} else
 
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-- 
cgit v1.1


From 945315b9dbbe102bb3393a34ea4a10fb2a5ff303 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 6 Mar 2013 16:05:52 +0900
Subject: KVM: MMU: Use list_for_each_entry_safe in kvm_mmu_commit_zap_page()

We are traversing the linked list, invalid_list, deleting each entry by
kvm_mmu_free_page().  _safe version is there for such a case.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3e4822b..0f42645 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2087,7 +2087,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
-	struct kvm_mmu_page *sp;
+	struct kvm_mmu_page *sp, *nsp;
 
 	if (list_empty(invalid_list))
 		return;
@@ -2104,11 +2104,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	 */
 	kvm_flush_remote_tlbs(kvm);
 
-	do {
-		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
 		WARN_ON(!sp->role.invalid || sp->root_count);
 		kvm_mmu_free_page(sp);
-	} while (!list_empty(invalid_list));
+	}
 }
 
 /*
-- 
cgit v1.1


From 5da596078f915a62e39a20e582308eab91b88c9a Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Wed, 6 Mar 2013 16:06:58 +0900
Subject: KVM: MMU: Introduce a helper function for FIFO zapping

Make the code for zapping the oldest mmu page, placed at the tail of the
active list, a separate function.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 55 +++++++++++++++++++++++-------------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0f42645..fdacabb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2110,6 +2110,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	}
 }
 
+static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
+					struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	if (list_empty(&kvm->arch.active_mmu_pages))
+		return false;
+
+	sp = list_entry(kvm->arch.active_mmu_pages.prev,
+			struct kvm_mmu_page, link);
+	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+	return true;
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
@@ -2117,23 +2132,15 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 {
 	LIST_HEAD(invalid_list);
-	/*
-	 * If we set the number of mmu pages to be smaller be than the
-	 * number of actived pages , we must to free some mmu pages before we
-	 * change the value
-	 */
 
 	spin_lock(&kvm->mmu_lock);
 
 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
-		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
-			!list_empty(&kvm->arch.active_mmu_pages)) {
-			struct kvm_mmu_page *page;
+		/* Need to free some mmu pages to achieve the goal. */
+		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+			if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+				break;
 
-			page = container_of(kvm->arch.active_mmu_pages.prev,
-					    struct kvm_mmu_page, link);
-			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-		}
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
@@ -4007,13 +4014,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
 	LIST_HEAD(invalid_list);
 
-	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
-	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
-		struct kvm_mmu_page *sp;
+	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+		if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+			break;
 
-		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
-				  struct kvm_mmu_page, link);
-		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -4182,19 +4186,6 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
-						struct list_head *invalid_list)
-{
-	struct kvm_mmu_page *page;
-
-	if (list_empty(&kvm->arch.active_mmu_pages))
-		return;
-
-	page = container_of(kvm->arch.active_mmu_pages.prev,
-			    struct kvm_mmu_page, link);
-	kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
-}
-
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
@@ -4229,7 +4220,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 
-		kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
+		prepare_zap_oldest_mmu_page(kvm, &invalid_list);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
 		spin_unlock(&kvm->mmu_lock);
-- 
cgit v1.1


From 0fa24ce3f57144e390a1566774c23434975a52a9 Mon Sep 17 00:00:00 2001
From: Ioan Orghici <ioanorghici@gmail.com>
Date: Sun, 10 Mar 2013 15:46:00 +0200
Subject: kvm: remove cast for kmalloc return value

Signed-off-by: Ioan Orghici<ioan.orghici@gmail.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260da9a..473a5fe 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5286,8 +5286,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 	}
 
 	/* Create a new VMCS */
-	item = (struct vmcs02_list *)
-		kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+	item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
 	if (!item)
 		return NULL;
 	item->vmcs02.vmcs = alloc_vmcs();
-- 
cgit v1.1


From 03ba32cae66e3798d697e582633af2c7dd6907e5 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 11 Mar 2013 23:10:24 -0300
Subject: VMX: x86: handle host TSC calibration failure

If the host TSC calibration fails, tsc_khz is zero (see tsc_init.c).
Handle such case properly in KVM (instead of dividing by zero).

https://bugzilla.redhat.com/show_bug.cgi?id=859282

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35b4912..b67985a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1079,6 +1079,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 	u32 thresh_lo, thresh_hi;
 	int use_scaling = 0;
 
+	/* tsc_khz can be zero if TSC calibration fails */
+	if (this_tsc_khz == 0)
+		return;
+
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
 			   &vcpu->arch.virtual_tsc_shift,
@@ -1156,20 +1160,23 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
-	/* n.b - signed multiplication and division required */
-	usdiff = data - kvm->arch.last_tsc_write;
+	if (vcpu->arch.virtual_tsc_khz) {
+		/* n.b - signed multiplication and division required */
+		usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
-	usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
-	/* do_div() only does unsigned */
-	asm("idivl %2; xor %%edx, %%edx"
-	    : "=A"(usdiff)
-	    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+		/* do_div() only does unsigned */
+		asm("idivl %2; xor %%edx, %%edx"
+		: "=A"(usdiff)
+		: "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
 #endif
-	do_div(elapsed, 1000);
-	usdiff -= elapsed;
-	if (usdiff < 0)
-		usdiff = -usdiff;
+		do_div(elapsed, 1000);
+		usdiff -= elapsed;
+		if (usdiff < 0)
+			usdiff = -usdiff;
+	} else
+		usdiff = USEC_PER_SEC; /* disable TSC match window below */
 
 	/*
 	 * Special case: TSC write with a small delta (1 second) of virtual
-- 
cgit v1.1


From 57f252f22908535e04d520f3833a6e3116eb159d Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 12 Mar 2013 10:20:24 +0100
Subject: KVM: x86: Drop unused return code from VCPU reset callback

Neither vmx nor svm nor the common part may generate an error on
kvm_vcpu_reset. So drop the return code.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/svm.c |  4 +---
 arch/x86/kvm/vmx.c |  7 +------
 arch/x86/kvm/x86.c | 15 ++++++---------
 3 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e1b1ce2..907e428 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1191,7 +1191,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	enable_gif(svm);
 }
 
-static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u32 dummy;
@@ -1207,8 +1207,6 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
-
-	return 0;
 }
 
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 473a5fe..f588171 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4100,11 +4100,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	return 0;
 }
 
-static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 msr;
-	int ret;
 
 	vmx->rmode.vm86_active = 0;
 
@@ -4195,10 +4194,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	update_exception_bitmap(&vmx->vcpu);
 
 	vpid_sync_context(vmx);
-
-	ret = 0;
-
-	return ret;
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b67985a..fadd5a7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,7 +162,7 @@ u64 __read_mostly host_xcr0;
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
@@ -5858,9 +5858,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		pr_debug("vcpu %d received sipi with vector # %x\n",
 			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
-		r = kvm_vcpu_reset(vcpu);
-		if (r)
-			return r;
+		kvm_vcpu_reset(vcpu);
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	}
 
@@ -6486,9 +6484,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_vcpu_reset(vcpu);
-	if (r == 0)
-		r = kvm_mmu_setup(vcpu);
+	kvm_vcpu_reset(vcpu);
+	r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
 
 	return r;
@@ -6525,7 +6522,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6552,7 +6549,7 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_avail = ~0;
 	vcpu->arch.regs_dirty = ~0;
 
-	return kvm_x86_ops->vcpu_reset(vcpu);
+	kvm_x86_ops->vcpu_reset(vcpu);
 }
 
 int kvm_arch_hardware_enable(void *garbage)
-- 
cgit v1.1


From 5d218814328da91a27e982748443e7e375e11396 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 Mar 2013 22:36:43 -0300
Subject: KVM: MMU: make kvm_mmu_available_pages robust against
 n_used_mmu_pages > n_max_mmu_pages

As noticed by Ulrich Obergfell <uobergfe@redhat.com>, the mmu
counters are for beancounting purposes only - so n_used_mmu_pages and
n_max_mmu_pages could be relaxed (example: before f0f5933a1626c8df7b),
resulting in n_used_mmu_pages > n_max_mmu_pages.

Make code robust against n_used_mmu_pages > n_max_mmu_pages.

Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6987108..3b1ad00 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,8 +57,11 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
-	return kvm->arch.n_max_mmu_pages -
-		kvm->arch.n_used_mmu_pages;
+	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+		return kvm->arch.n_max_mmu_pages -
+			kvm->arch.n_used_mmu_pages;
+
+	return 0;
 }
 
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-- 
cgit v1.1


From 66450a21f99636af4fafac2afd33f1a40631bc3a Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 13 Mar 2013 12:42:34 +0100
Subject: KVM: x86: Rework INIT and SIPI handling

A VCPU sending INIT or SIPI to some other VCPU races for setting the
remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED
was overwritten by kvm_emulate_halt and, thus, got lost.

This introduces APIC events for those two signals, keeping them in
kvm_apic until kvm_apic_accept_events is run over the target vcpu
context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable if there
are pending events, thus if vcpu blocking should end.

The patch comes with the side effect of effectively obsoleting
KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but
immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI.
The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED
state. That also means we no longer exit to user space after receiving a
SIPI event.

Furthermore, we already reset the VCPU on INIT, only fixing up the code
segment later on when SIPI arrives. Moreover, we fix INIT handling for
the BSP: it never enter wait-for-SIPI but directly starts over on INIT.

Tested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/lapic.c | 48 +++++++++++++++++++++++++++++++++++--------
 arch/x86/kvm/lapic.h | 11 ++++++++++
 arch/x86/kvm/svm.c   |  6 ------
 arch/x86/kvm/vmx.c   | 12 ++---------
 arch/x86/kvm/x86.c   | 58 ++++++++++++++++++++++++++++++++++------------------
 5 files changed, 91 insertions(+), 44 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 02b51dd..a8e9369 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_INIT:
 		if (!trig_mode || level) {
 			result = 1;
-			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			/* assumes that there are only KVM_APIC_INIT/SIPI */
+			apic->pending_events = (1UL << KVM_APIC_INIT);
+			/* make sure pending_events is visible before sending
+			 * the request */
+			smp_wmb();
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		} else {
@@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_STARTUP:
 		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
 			   vcpu->vcpu_id, vector);
-		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
-			result = 1;
-			vcpu->arch.sipi_vector = vector;
-			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			kvm_make_request(KVM_REQ_EVENT, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
+		result = 1;
+		apic->sipi_vector = vector;
+		/* make sure sipi_vector is visible for the receiver */
+		smp_wmb();
+		set_bit(KVM_APIC_SIPI, &apic->pending_events);
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_EXTINT:
@@ -1860,6 +1864,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
 					 addr);
 }
 
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	unsigned int sipi_vector;
+
+	if (!kvm_vcpu_has_lapic(vcpu))
+		return;
+
+	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
+		kvm_lapic_reset(vcpu);
+		kvm_vcpu_reset(vcpu);
+		if (kvm_vcpu_is_bsp(apic->vcpu))
+			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		else
+			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+	}
+	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
+	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+		/* evaluate pending_events before reading the vector */
+		smp_rmb();
+		sipi_vector = apic->sipi_vector;
+		pr_debug("vcpu %d received sipi with vector # %x\n",
+			 vcpu->vcpu_id, sipi_vector);
+		kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	}
+}
+
 void kvm_lapic_init(void)
 {
 	/* do not patch jump label more than once per second */
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34..2c721b9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -5,6 +5,9 @@
 
 #include <linux/kvm_host.h>
 
+#define KVM_APIC_INIT		0
+#define KVM_APIC_SIPI		1
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
@@ -32,6 +35,8 @@ struct kvm_lapic {
 	void *regs;
 	gpa_t vapic_addr;
 	struct page *vapic_page;
+	unsigned long pending_events;
+	unsigned int sipi_vector;
 };
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -158,4 +164,9 @@ void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
 				struct kvm_lapic_irq *irq,
 				u64 *eoi_bitmap);
 
+static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic->pending_events;
+}
+
 #endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 907e428..7219a40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1199,12 +1199,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	init_vmcb(svm);
 
-	if (!kvm_vcpu_is_bsp(vcpu)) {
-		kvm_rip_write(vcpu, 0);
-		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
-		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
-	}
-
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f588171..af1ffaf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4119,12 +4119,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx_segment_cache_clear(vmx);
 
 	seg_setup(VCPU_SREG_CS);
-	if (kvm_vcpu_is_bsp(&vmx->vcpu))
-		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-	else {
-		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
-		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
-	}
+	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
 
 	seg_setup(VCPU_SREG_DS);
 	seg_setup(VCPU_SREG_ES);
@@ -4147,10 +4142,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
-	if (kvm_vcpu_is_bsp(&vmx->vcpu))
-		kvm_rip_write(vcpu, 0xfff0);
-	else
-		kvm_rip_write(vcpu, 0);
+	kvm_rip_write(vcpu, 0xfff0);
 
 	vmcs_writel(GUEST_GDTR_BASE, 0);
 	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fadd5a7..61a5bb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
-static void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
-
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -2830,10 +2828,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 	events->nmi.pad = 0;
 
-	events->sipi_vector = vcpu->arch.sipi_vector;
+	events->sipi_vector = 0; /* never valid when reporting to user space */
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SHADOW);
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -2864,8 +2861,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		vcpu->arch.nmi_pending = events->nmi.pending;
 	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
-	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
-		vcpu->arch.sipi_vector = events->sipi_vector;
+	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+	    kvm_vcpu_has_lapic(vcpu))
+		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -5720,6 +5718,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		kvm_apic_accept_events(vcpu);
+		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			r = 1;
+			goto out;
+		}
+
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
@@ -5854,14 +5858,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	int r;
 	struct kvm *kvm = vcpu->kvm;
 
-	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		pr_debug("vcpu %d received sipi with vector # %x\n",
-			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
-		kvm_lapic_reset(vcpu);
-		kvm_vcpu_reset(vcpu);
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-	}
-
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 	r = vapic_enter(vcpu);
 	if (r) {
@@ -5878,8 +5874,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
-			{
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+				kvm_apic_accept_events(vcpu);
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
 					vcpu->arch.mp_state =
@@ -5887,7 +5883,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 				case KVM_MP_STATE_RUNNABLE:
 					vcpu->arch.apf.halted = false;
 					break;
-				case KVM_MP_STATE_SIPI_RECEIVED:
+				case KVM_MP_STATE_INIT_RECEIVED:
+					break;
 				default:
 					r = -EINTR;
 					break;
@@ -6022,6 +6019,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
+		kvm_apic_accept_events(vcpu);
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		r = -EAGAIN;
 		goto out;
@@ -6178,6 +6176,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	kvm_apic_accept_events(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
 	return 0;
 }
@@ -6185,7 +6184,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu->arch.mp_state = mp_state->mp_state;
+	if (!kvm_vcpu_has_lapic(vcpu) &&
+	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+		return -EINVAL;
+
+	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+	} else
+		vcpu->arch.mp_state = mp_state->mp_state;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return 0;
 }
@@ -6522,7 +6529,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6552,6 +6559,17 @@ static void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_reset(vcpu);
 }
 
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+{
+	struct kvm_segment cs;
+
+	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	cs.selector = vector << 8;
+	cs.base = vector << 12;
+	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+	kvm_rip_write(vcpu, 0);
+}
+
 int kvm_arch_hardware_enable(void *garbage)
 {
 	struct kvm *kvm;
@@ -6995,7 +7013,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
-		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+		|| kvm_apic_has_events(vcpu)
 		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
-- 
cgit v1.1


From eabeaaccfca0ed61b8e00a09b8cfa703c4f11b59 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 13 Mar 2013 11:30:50 +0100
Subject: KVM: nVMX: Clean up and fix pin-based execution controls

Only interrupt and NMI exiting are mandatory for KVM to work, thus can
be exposed to the guest unconditionally, virtual NMI exiting is
optional. So we must not advertise it unless the host supports it.

Introduce the symbolic constant PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR at
this chance.

Reviewed-by:: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index af1ffaf..8eaabfb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2040,14 +2040,16 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 */
 
 	/* pin-based controls */
+	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
+	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
 	/*
 	 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
 	 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
 	 */
-	nested_vmx_pinbased_ctls_low = 0x16 ;
-	nested_vmx_pinbased_ctls_high = 0x16 |
-		PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
-		PIN_BASED_VIRTUAL_NMIS;
+	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
+		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
 	/*
 	 * Exit controls
-- 
cgit v1.1


From c18911a23ce1dec27fa3325b50587de2569d26f8 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 13 Mar 2013 16:06:41 +0100
Subject: KVM: nVMX: Provide EFER.LMA saving support

We will need EFER.LMA saving to provide unrestricted guest mode. All
what is missing for this is picking up EFER.LMA from VM_ENTRY_CONTROLS
on L2->L1 switches. If the host does not support EFER.LMA saving,
no change is performed, otherwise we properly emulate for L1 what the
hardware does for L0. Advertise the support, depending on the host
feature.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8eaabfb..02f8c32 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2022,6 +2022,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_misc_low, nested_vmx_misc_high;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
 	/*
@@ -2106,6 +2107,11 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	nested_vmx_secondary_ctls_high &=
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_WBINVD_EXITING;
+
+	/* miscellaneous data */
+	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
+	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+	nested_vmx_misc_high = 0;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2176,7 +2182,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 					nested_vmx_entry_ctls_high);
 		break;
 	case MSR_IA32_VMX_MISC:
-		*pdata = 0;
+		*pdata = vmx_control_msr(nested_vmx_misc_low,
+					 nested_vmx_misc_high);
 		break;
 	/*
 	 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -7398,6 +7405,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+	vmcs12->vm_entry_controls =
+		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
+
 	/* TODO: These cannot have changed unless we have MSR bitmaps and
 	 * the relevant bit asks not to trap the change */
 	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-- 
cgit v1.1


From 0238ea913c21a89387f93097acfbdfeebc9c9257 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 13 Mar 2013 11:31:24 +0100
Subject: KVM: nVMX: Add preemption timer support

Provided the host has this feature, it's straightforward to offer it to
the guest as well. We just need to load to timer value on L2 entry if
the feature was enabled by L1 and watch out for the corresponding exit
reason.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02f8c32..17a6938 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -298,7 +298,8 @@ struct __packed vmcs12 {
 	u32 guest_activity_state;
 	u32 guest_sysenter_cs;
 	u32 host_ia32_sysenter_cs;
-	u32 padding32[8]; /* room for future expansion */
+	u32 vmx_preemption_timer_value;
+	u32 padding32[7]; /* room for future expansion */
 	u16 virtual_processor_id;
 	u16 guest_es_selector;
 	u16 guest_cs_selector;
@@ -537,6 +538,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
 	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
 	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
 	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
 	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
 	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
@@ -2049,7 +2051,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	 */
 	nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 	nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
-		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+		PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+		PIN_BASED_VMX_PREEMPTION_TIMER;
 	nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
 	/*
@@ -2110,7 +2113,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
-	nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+	nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
+		VMX_MISC_SAVE_EFER_LMA;
 	nested_vmx_misc_high = 0;
 }
 
@@ -6190,6 +6194,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_EPT_VIOLATION:
 	case EXIT_REASON_EPT_MISCONFIG:
 		return 0;
+	case EXIT_REASON_PREEMPTION_TIMER:
+		return vmcs12->pin_based_vm_exec_control &
+			PIN_BASED_VMX_PREEMPTION_TIMER;
 	case EXIT_REASON_WBINVD:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
 	case EXIT_REASON_XSETBV:
@@ -7011,6 +7018,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		(vmcs_config.pin_based_exec_ctrl |
 		 vmcs12->pin_based_vm_exec_control));
 
+	if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
+			     vmcs12->vmx_preemption_timer_value);
+
 	/*
 	 * Whether page-faults are trapped is determined by a combination of
 	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-- 
cgit v1.1


From 95b0430d1a53541076ffbaf453f8b49a547cceba Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Tue, 12 Mar 2013 17:44:40 +0900
Subject: KVM: MMU: Mark sp mmio cached when creating mmio spte

This will be used not to zap unrelated mmu pages when creating/moving
a memory slot later.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fdacabb..de45ec1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,8 +199,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
 static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
 {
+	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
 
+	sp->mmio_cached = true;
 	trace_mark_mmio_spte(sptep, gfn, access);
 	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
 }
-- 
cgit v1.1


From 982b3394dd23eec6e5a2f7871238435a167b63cc Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Tue, 12 Mar 2013 17:45:30 +0900
Subject: KVM: x86: Optimize mmio spte zapping when creating/moving memslot

When we create or move a memory slot, we need to zap mmio sptes.
Currently, zap_all() is used for this and this is causing two problems:
 - extra page faults after zapping mmu pages
 - long mmu_lock hold time during zapping mmu pages

For the latter, Marcelo reported a disastrous mmu_lock hold time during
hot-plug, which made the guest unresponsive for a long time.

This patch takes a simple way to fix these problems: do not zap mmu
pages unless they are marked mmio cached.  On our test box, this took
only 50us for the 4GB guest and we did not see ms of mmu_lock hold time
any more.

Note that we still need to do zap_all() for other cases.  So another
work is also needed: Xiao's work may be the one.

Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c | 18 ++++++++++++++++++
 arch/x86/kvm/x86.c |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index de45ec1..c1a9b7b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4189,6 +4189,24 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
+void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+
+	spin_lock(&kvm->mmu_lock);
+restart:
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+		if (!sp->mmio_cached)
+			continue;
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+			goto restart;
+	}
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 61a5bb6..d3c4787 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6991,7 +6991,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * mmio sptes.
 	 */
 	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
-		kvm_mmu_zap_all(kvm);
+		kvm_mmu_zap_mmio_sptes(kvm);
 		kvm_reload_remote_mmus(kvm);
 	}
 }
-- 
cgit v1.1


From 4918c6ca6838658b71d3ce75e1a4373195ac8d40 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 15 Mar 2013 08:38:56 +0100
Subject: KVM: VMX: Require KVM_SET_TSS_ADDR being called prior to running a
 VCPU

Very old user space (namely qemu-kvm before kvm-49) didn't set the TSS
base before running the VCPU. We always warned about this bug, but no
reports about users actually seeing this are known. Time to finally
remove the workaround that effectively prevented to call vmx_vcpu_reset
while already holding the KVM srcu lock.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 30 ++++--------------------------
 1 file changed, 4 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 17a6938..ad978a6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2898,22 +2898,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	vmx->cpl = 0;
 }
 
-static gva_t rmode_tss_base(struct kvm *kvm)
-{
-	if (!kvm->arch.tss_addr) {
-		struct kvm_memslots *slots;
-		struct kvm_memory_slot *slot;
-		gfn_t base_gfn;
-
-		slots = kvm_memslots(kvm);
-		slot = id_to_memslot(slots, 0);
-		base_gfn = slot->base_gfn + slot->npages - 3;
-
-		return base_gfn << PAGE_SHIFT;
-	}
-	return kvm->arch.tss_addr;
-}
-
 static void fix_rmode_seg(int seg, struct kvm_segment *save)
 {
 	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -2964,19 +2948,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
 	/*
 	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
-	 * vcpu. Call it here with phys address pointing 16M below 4G.
+	 * vcpu. Warn the user that an update is overdue.
 	 */
-	if (!vcpu->kvm->arch.tss_addr) {
+	if (!vcpu->kvm->arch.tss_addr)
 		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
 			     "called before entering vcpu\n");
-		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
-		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-	}
 
 	vmx_segment_cache_clear(vmx);
 
-	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+	vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
 	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
 	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
@@ -3623,7 +3603,7 @@ static int init_rmode_tss(struct kvm *kvm)
 	int r, idx, ret = 0;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+	fn = kvm->arch.tss_addr >> PAGE_SHIFT;
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 	if (r < 0)
 		goto out;
@@ -4190,9 +4170,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
 	vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 	vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 	vmx_set_cr4(&vmx->vcpu, 0);
 	vmx_set_efer(&vmx->vcpu, 0);
 	vmx_fpu_activate(&vmx->vcpu);
-- 
cgit v1.1


From 04b66839d312d3bdaff77f265eb7305347fa1fb7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 Mar 2013 16:30:26 +0100
Subject: KVM: x86: correctly initialize the CS base on reset

The CS base was initialized to 0 on VMX (wrong, but usually overridden
by userspace before starting) or 0xf0000 on SVM.  The correct value is
0xffff0000, and VMX is able to emulate it now, so use it.

Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 8 +-------
 arch/x86/kvm/vmx.c | 1 +
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7219a40..7a46c1f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1131,17 +1131,11 @@ static void init_vmcb(struct vcpu_svm *svm)
 	init_seg(&save->gs);
 
 	save->cs.selector = 0xf000;
+	save->cs.base = 0xffff0000;
 	/* Executable/Readable Code Segment */
 	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
 		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
 	save->cs.limit = 0xffff;
-	/*
-	 * cs.base should really be 0xffff0000, but vmx can't handle that, so
-	 * be consistent with it.
-	 *
-	 * Replace when we have real mode working for vmx.
-	 */
-	save->cs.base = 0xf0000;
 
 	save->gdtr.limit = 0xffff;
 	save->idtr.limit = 0xffff;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad978a6..03f5746 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4113,6 +4113,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	seg_setup(VCPU_SREG_CS);
 	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+	vmcs_write32(GUEST_CS_BASE, 0xffff0000);
 
 	seg_setup(VCPU_SREG_DS);
 	seg_setup(VCPU_SREG_ES);
-- 
cgit v1.1


From 7ddca7e43c8f28f9419da81a0e7730b66aa60fe9 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Thu, 21 Mar 2013 19:33:43 +0900
Subject: KVM: MMU: Move kvm_mmu_free_some_pages() into kvm_mmu_alloc_page()

What this function is doing is to ensure that the number of shadow pages
does not exceed the maximum limit stored in n_max_mmu_pages: so this is
placed at every code path that can reach kvm_mmu_alloc_page().

Although it might have some sense to spread this function in each such
code path when it could be called before taking mmu_lock, the rule was
changed not to do so.

Taking this background into account, this patch moves it into
kvm_mmu_alloc_page() and simplifies the code.

Note: the unlikely hint in kvm_mmu_free_some_pages() guarantees that the
overhead of this function is almost zero except when we actually need to
allocate some shadow pages, so we do not need to care about calling it
multiple times in one path by doing kvm_mmu_get_page() a few times.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c         | 9 +++------
 arch/x86/kvm/paging_tmpl.h | 1 -
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c1a9b7b..38f34c5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1505,6 +1505,9 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
+
+	kvm_mmu_free_some_pages(vcpu);
+
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	if (!direct)
@@ -2842,7 +2845,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
-	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
@@ -2920,7 +2922,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		spin_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
 				      1, ACC_ALL, NULL);
 		++sp->root_count;
@@ -2932,7 +2933,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 			ASSERT(!VALID_PAGE(root));
 			spin_lock(&vcpu->kvm->mmu_lock);
-			kvm_mmu_free_some_pages(vcpu);
 			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
 					      i << 30,
 					      PT32_ROOT_LEVEL, 1, ACC_ALL,
@@ -2971,7 +2971,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		ASSERT(!VALID_PAGE(root));
 
 		spin_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
 				      0, ACC_ALL, NULL);
 		root = __pa(sp->spt);
@@ -3005,7 +3004,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 				return 1;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, 0,
 				      ACC_ALL, NULL);
@@ -3311,7 +3309,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
-	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..af143f0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -627,7 +627,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
-	kvm_mmu_free_some_pages(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-- 
cgit v1.1


From 81f4f76bbc712a2dff8bb020057c554e285370e1 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Thu, 21 Mar 2013 19:34:27 +0900
Subject: KVM: MMU: Rename kvm_mmu_free_some_pages() to
 make_mmu_pages_available()

The current name "kvm_mmu_free_some_pages" should be used for something
that actually frees some shadow pages, as we expect from the name, but
what the function is doing is to make some, KVM_MIN_FREE_MMU_PAGES,
shadow pages available: it does nothing when there are enough.

This patch changes the name to reflect this meaning better; while doing
this renaming, the code in the wrapper function is inlined into the main
body since the whole function will be inlined into the only caller now.

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 9 +++++++--
 arch/x86/kvm/mmu.h | 6 ------
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 38f34c5..633e30c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1501,12 +1501,14 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
 	mmu_spte_clear_no_track(parent_pte);
 }
 
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
+
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
 
-	kvm_mmu_free_some_pages(vcpu);
+	make_mmu_pages_available(vcpu);
 
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
@@ -4010,10 +4012,13 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
 	LIST_HEAD(invalid_list);
 
+	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+		return;
+
 	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
 		if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
 			break;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3b1ad00..2adcbc2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -64,12 +64,6 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 	return 0;
 }
 
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
-		__kvm_mmu_free_some_pages(vcpu);
-}
-
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
 	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
-- 
cgit v1.1


From afd80d85aefac27e6e2f9dc10f60515357c504d2 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 28 Mar 2013 17:18:35 +0100
Subject: pmu: prepare for migration support

In order to migrate the PMU state correctly, we need to restore the
values of MSR_CORE_PERF_GLOBAL_STATUS (a read-only register) and
MSR_CORE_PERF_GLOBAL_OVF_CTRL (which has side effects when written).
We also need to write the full 40-bit value of the performance counter,
which would only be possible with a v3 architectural PMU's full-width
counter MSRs.

To distinguish host-initiated writes from the guest's, pass the
full struct msr_data to kvm_pmu_set_msr.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/pmu.c | 14 +++++++++++---
 arch/x86/kvm/x86.c |  4 ++--
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index cfc258a..c53e797 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -360,10 +360,12 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 	return 1;
 }
 
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = &vcpu->arch.pmu;
 	struct kvm_pmc *pmc;
+	u32 index = msr_info->index;
+	u64 data = msr_info->data;
 
 	switch (index) {
 	case MSR_CORE_PERF_FIXED_CTR_CTRL:
@@ -375,6 +377,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 		}
 		break;
 	case MSR_CORE_PERF_GLOBAL_STATUS:
+		if (msr_info->host_initiated) {
+			pmu->global_status = data;
+			return 0;
+		}
 		break; /* RO MSR */
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 		if (pmu->global_ctrl == data)
@@ -386,7 +392,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 		break;
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
-			pmu->global_status &= ~data;
+			if (!msr_info->host_initiated)
+				pmu->global_status &= ~data;
 			pmu->global_ovf_ctrl = data;
 			return 0;
 		}
@@ -394,7 +401,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 	default:
 		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
 				(pmc = get_fixed_pmc(pmu, index))) {
-			data = (s64)(s32)data;
+			if (!msr_info->host_initiated)
+				data = (s64)(s32)data;
 			pmc->counter += data - read_pmc(pmc);
 			return 0;
 		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2aaba81..78c6f90 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2040,7 +2040,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_P6_EVNTSEL0:
 	case MSR_P6_EVNTSEL1:
 		if (kvm_pmu_msr(vcpu, msr))
-			return kvm_pmu_set_msr(vcpu, msr, data);
+			return kvm_pmu_set_msr(vcpu, msr_info);
 
 		if (pr || data != 0)
 			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
@@ -2086,7 +2086,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
 		if (kvm_pmu_msr(vcpu, msr))
-			return kvm_pmu_set_msr(vcpu, msr, data);
+			return kvm_pmu_set_msr(vcpu, msr_info);
 		if (!ignore_msrs) {
 			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 				    msr, data);
-- 
cgit v1.1


From 450e0b411f56db9839c9725d4d49e70e24f87ba3 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Date: Fri, 29 Mar 2013 14:05:26 +0900
Subject: Revert "KVM: MMU: Move kvm_mmu_free_some_pages() into
 kvm_mmu_alloc_page()"

With the following commit, shadow pages can be zapped at random during
a shadow page talbe walk:
  KVM: MMU: Move kvm_mmu_free_some_pages() into kvm_mmu_alloc_page()
  7ddca7e43c8f28f9419da81a0e7730b66aa60fe9

This patch reverts it and fixes __direct_map() and FNAME(fetch)().

Signed-off-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c         | 11 +++++++----
 arch/x86/kvm/paging_tmpl.h |  1 +
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 633e30c..004cc87 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1501,15 +1501,11 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
 	mmu_spte_clear_no_track(parent_pte);
 }
 
-static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
-
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
 
-	make_mmu_pages_available(vcpu);
-
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	if (!direct)
@@ -2806,6 +2802,7 @@ exit:
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 			 gfn_t gfn, bool prefault)
@@ -2847,6 +2844,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
+	make_mmu_pages_available(vcpu);
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
@@ -2924,6 +2922,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		spin_lock(&vcpu->kvm->mmu_lock);
+		make_mmu_pages_available(vcpu);
 		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
 				      1, ACC_ALL, NULL);
 		++sp->root_count;
@@ -2935,6 +2934,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 			ASSERT(!VALID_PAGE(root));
 			spin_lock(&vcpu->kvm->mmu_lock);
+			make_mmu_pages_available(vcpu);
 			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
 					      i << 30,
 					      PT32_ROOT_LEVEL, 1, ACC_ALL,
@@ -2973,6 +2973,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		ASSERT(!VALID_PAGE(root));
 
 		spin_lock(&vcpu->kvm->mmu_lock);
+		make_mmu_pages_available(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
 				      0, ACC_ALL, NULL);
 		root = __pa(sp->spt);
@@ -3006,6 +3007,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 				return 1;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
+		make_mmu_pages_available(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, 0,
 				      ACC_ALL, NULL);
@@ -3311,6 +3313,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
+	make_mmu_pages_available(vcpu);
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index af143f0..da20860 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -627,6 +627,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+	make_mmu_pages_available(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-- 
cgit v1.1


From 44944d4d28948c71b110b09a2e924e505cd39e8b Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Sun, 7 Apr 2013 08:25:18 +0800
Subject: KVM: Call kvm_apic_match_dest() to check destination vcpu

For a given vcpu, kvm_apic_match_dest() will tell you whether
the vcpu in the destination list quickly. Drop kvm_calculate_eoi_exitmap()
and use kvm_apic_match_dest() instead.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/lapic.c | 47 -----------------------------------------------
 arch/x86/kvm/lapic.h |  4 ----
 2 files changed, 51 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a8e9369..e227474 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -145,53 +145,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
 	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
-void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-				struct kvm_lapic_irq *irq,
-				u64 *eoi_exit_bitmap)
-{
-	struct kvm_lapic **dst;
-	struct kvm_apic_map *map;
-	unsigned long bitmap = 1;
-	int i;
-
-	rcu_read_lock();
-	map = rcu_dereference(vcpu->kvm->arch.apic_map);
-
-	if (unlikely(!map)) {
-		__set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
-		goto out;
-	}
-
-	if (irq->dest_mode == 0) { /* physical mode */
-		if (irq->delivery_mode == APIC_DM_LOWEST ||
-				irq->dest_id == 0xff) {
-			__set_bit(irq->vector,
-				  (unsigned long *)eoi_exit_bitmap);
-			goto out;
-		}
-		dst = &map->phys_map[irq->dest_id & 0xff];
-	} else {
-		u32 mda = irq->dest_id << (32 - map->ldr_bits);
-
-		dst = map->logical_map[apic_cluster_id(map, mda)];
-
-		bitmap = apic_logical_id(map, mda);
-	}
-
-	for_each_set_bit(i, &bitmap, 16) {
-		if (!dst[i])
-			continue;
-		if (dst[i]->vcpu == vcpu) {
-			__set_bit(irq->vector,
-				  (unsigned long *)eoi_exit_bitmap);
-			break;
-		}
-	}
-
-out:
-	rcu_read_unlock();
-}
-
 static void recalculate_apic_map(struct kvm *kvm)
 {
 	struct kvm_apic_map *new, *old = NULL;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2c721b9..baa20cf 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -160,10 +160,6 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
 	return ldr & map->lid_mask;
 }
 
-void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-				struct kvm_lapic_irq *irq,
-				u64 *eoi_bitmap);
-
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.apic->pending_events;
-- 
cgit v1.1


From b8c07d55d010702eff61562cf9a77366833d9da2 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 6 Apr 2013 13:51:21 +0200
Subject: KVM: nVMX: Check exit control for VM_EXIT_SAVE_IA32_PAT, not entry
 controls

Obviously a copy&paste mistake: prepare_vmcs12 has to check L1's exit
controls for VM_EXIT_SAVE_IA32_PAT.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 03f5746..b67f354 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7402,7 +7402,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	/* TODO: These cannot have changed unless we have MSR bitmaps and
 	 * the relevant bit asks not to trap the change */
 	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-	if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
+	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
 		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
 	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
 	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
-- 
cgit v1.1


From 458f212e36b291067b74c0cac2bdcf8278817ee7 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Mon, 8 Apr 2013 15:26:33 +0800
Subject: KVM: x86: fix memory leak in vmx_init

Free vmx_msr_bitmap_longmode_x2apic and vmx_msr_bitmap_longmode if
kvm_init() fails.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b67f354..1cf202c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7741,7 +7741,7 @@ static int __init vmx_init(void)
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
-		goto out3;
+		goto out5;
 
 #ifdef CONFIG_KEXEC
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
@@ -7789,6 +7789,8 @@ static int __init vmx_init(void)
 
 	return 0;
 
+out5:
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
 out3:
-- 
cgit v1.1


From a63cb56061239372fce2452dbedd35c95bd665aa Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 8 Apr 2013 11:07:46 +0200
Subject: KVM: VMX: Add missing braces to avoid redundant error check

The code was already properly aligned, now also add the braces to avoid
that err is checked even if alloc_apic_access_page didn't run and change
it. Found via Coccinelle by Fengguang Wu.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1cf202c..669b803 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6785,10 +6785,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	put_cpu();
 	if (err)
 		goto free_vmcs;
-	if (vm_need_virtualize_apic_accesses(kvm))
+	if (vm_need_virtualize_apic_accesses(kvm)) {
 		err = alloc_apic_access_page(kvm);
 		if (err)
 			goto free_vmcs;
+	}
 
 	if (enable_ept) {
 		if (!kvm->arch.ept_identity_map_addr)
-- 
cgit v1.1


From e3ba45b8041740f4ab8bbba3c6239876661aeed6 Mon Sep 17 00:00:00 2001
From: Geoff Levand <geoff@infradead.org>
Date: Fri, 5 Apr 2013 19:20:30 +0000
Subject: KVM: Move kvm_spurious_fault to x86.c

The routine kvm_spurious_fault() is an x86 specific routine, so
move it from virt/kvm/kvm_main.c to arch/x86/kvm/x86.c.

Fixes this sparse warning when building on arm64:

  virt/kvm/kvm_main.c:warning: symbol 'kvm_spurious_fault' was not declared. Should it be static?

Signed-off-by: Geoff Levand <geoff@infradead.org>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 78c6f90..eb9927e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -261,6 +261,13 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
+asmlinkage void kvm_spurious_fault(void)
+{
+	/* Fault while not rebooting.  We want the trace. */
+	BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
 #define EXCPT_BENIGN		0
 #define EXCPT_CONTRIBUTORY	1
 #define EXCPT_PF		2
-- 
cgit v1.1


From f8da94e9e44b237fa5cc8521faeb714dc2e83b54 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 11 Apr 2013 14:06:03 +0200
Subject: KVM: x86 emulator: Fix segment loading in VM86

This fixes a regression introduced in commit 03ebebeb1 ("KVM: x86
emulator: Leave segment limit and attributs alone in real mode").

The mentioned commit changed the segment descriptors for both real mode
and VM86 to only update the segment base instead of creating a
completely new descriptor with limit 0xffff so that unreal mode keeps
working across a segment register reload.

This leads to an invalid segment descriptor in the eyes of VMX, which
seems to be okay for real mode because KVM will fix it up before the
next VM entry or emulate the state, but it doesn't do this if the guest
is in VM86, so we end up with:

  KVM: entry failed, hardware error 0x80000021

Fix this by effectively reverting commit 03ebebeb1 for VM86 and leaving
it only in place for real mode, which is where it's really needed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a335cc6..069d799 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1578,12 +1578,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	memset(&seg_desc, 0, sizeof seg_desc);
 
-	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
-	    || ctxt->mode == X86EMUL_MODE_REAL) {
-		/* set real mode segment descriptor */
+	if (ctxt->mode == X86EMUL_MODE_REAL) {
+		/* set real mode segment descriptor (keep limit etc. for
+		 * unreal mode) */
 		ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
 		set_desc_base(&seg_desc, selector << 4);
 		goto load;
+	} else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+		/* VM86 needs a clean new segment descriptor */
+		set_desc_base(&seg_desc, selector << 4);
+		set_desc_limit(&seg_desc, 0xffff);
+		seg_desc.type = 3;
+		seg_desc.p = 1;
+		seg_desc.s = 1;
+		seg_desc.dpl = 3;
+		goto load;
 	}
 
 	rpl = selector & 3;
-- 
cgit v1.1


From 0b789eee2c0204da83278f181428560faf6efefb Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 11 Apr 2013 11:59:55 +0300
Subject: KVM: emulator: fix unimplemented instruction detection

Unimplemented instruction detection is broken for group instructions
since it relies on "flags" field of opcode to be zero, but all
instructions in a group inherit flags from a group encoding. Fix that by
having a separate flag for unimplemented instructions.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 069d799..ab6fda4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -132,8 +132,9 @@
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64	    (1<<28)
 #define PageTable   (1 << 29)   /* instruction used to write page table */
+#define NotImpl     (1 << 30)   /* instruction is not implemented */
 /* Source 2 operand type */
-#define Src2Shift   (30)
+#define Src2Shift   (31)
 #define Src2None    (OpNone << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
@@ -3624,7 +3625,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
 #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
 		      .check_perm = (_p) }
-#define N    D(0)
+#define N    D(NotImpl)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
@@ -4382,7 +4383,7 @@ done_prefixes:
 	ctxt->intercept = opcode.intercept;
 
 	/* Unrecognised? */
-	if (ctxt->d == 0 || (ctxt->d & Undefined))
+	if (ctxt->d == 0 || (ctxt->d & NotImpl) || (ctxt->d & Undefined))
 		return EMULATION_FAILED;
 
 	if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
-- 
cgit v1.1


From 991eebf9f8e523e7ff1e4d31ac80641582b2e57a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 11 Apr 2013 12:10:51 +0300
Subject: KVM: VMX: do not try to reexecute failed instruction while emulating
 invalid guest state

During invalid guest state emulation vcpu cannot enter guest mode to try
to reexecute instruction that emulator failed to emulate, so emulation
will happen again and again.  Prevent that by telling the emulator that
instruction reexecution should not be attempted.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c |  2 +-
 arch/x86/kvm/x86.c | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 669b803..d268677 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5189,7 +5189,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 		if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
 			return 1;
 
-		err = emulate_instruction(vcpu, 0);
+		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
 
 		if (err == EMULATE_DO_MMIO) {
 			ret = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eb9927e..999d124 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4765,11 +4765,15 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
-				  bool write_fault_to_shadow_pgtable)
+				  bool write_fault_to_shadow_pgtable,
+				  int emulation_type)
 {
 	gpa_t gpa = cr2;
 	pfn_t pfn;
 
+	if (emulation_type & EMULTYPE_NO_REEXECUTE)
+		return false;
+
 	if (!vcpu->arch.mmu.direct_map) {
 		/*
 		 * Write permission should be allowed since only
@@ -4912,8 +4916,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		if (r != EMULATION_OK)  {
 			if (emulation_type & EMULTYPE_TRAP_UD)
 				return EMULATE_FAIL;
-			if (reexecute_instruction(vcpu, cr2,
-						  write_fault_to_spt))
+			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+						emulation_type))
 				return EMULATE_DONE;
 			if (emulation_type & EMULTYPE_SKIP)
 				return EMULATE_FAIL;
@@ -4943,7 +4947,8 @@ restart:
 		return EMULATE_DONE;
 
 	if (r == EMULATION_FAILED) {
-		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
+		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+					emulation_type))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
-- 
cgit v1.1


From 1146a78b8d88fc0e0ca2ab9c549821242e289432 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 11 Apr 2013 12:30:01 +0300
Subject: KVM: emulator: Do not fail on emulation of undefined opcode

Emulation of undefined opcode should inject #UD instead of causing
emulation failure. Do that by moving Undefined flag check to emulation
stage and injection #UD there.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/emulate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ab6fda4..77b56d5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4383,7 +4383,7 @@ done_prefixes:
 	ctxt->intercept = opcode.intercept;
 
 	/* Unrecognised? */
-	if (ctxt->d == 0 || (ctxt->d & NotImpl) || (ctxt->d & Undefined))
+	if (ctxt->d == 0 || (ctxt->d & NotImpl))
 		return EMULATION_FAILED;
 
 	if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
@@ -4521,7 +4521,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 	ctxt->mem_read.pos = 0;
 
-	if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
+	if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+			(ctxt->d & Undefined)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
-- 
cgit v1.1


From 188424ba10313bbbc015727d418b7f18593becdd Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 11 Apr 2013 12:32:14 +0300
Subject: KVM: emulator: mark 0xff 0x7d opcode as undefined.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 77b56d5..46f63b8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3723,7 +3723,7 @@ static const struct opcode group5[] = {
 	I(SrcMemFAddr | ImplicitOps | Stack,	em_call_far),
 	I(SrcMem | Stack,			em_grp45),
 	I(SrcMemFAddr | ImplicitOps,		em_grp45),
-	I(SrcMem | Stack,			em_grp45), N,
+	I(SrcMem | Stack,			em_grp45), D(Undefined),
 };
 
 static const struct opcode group6[] = {
-- 
cgit v1.1


From 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 12:12:45 +0200
Subject: KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to
 L1

Check if the interrupt or NMI window exit is for L1 by testing if it has
the corresponding controls enabled. This is required when we allow
direct injection from L0 to L2

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d268677..5ae969b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6112,14 +6112,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_TRIPLE_FAULT:
 		return 1;
 	case EXIT_REASON_PENDING_INTERRUPT:
+		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
 	case EXIT_REASON_NMI_WINDOW:
-		/*
-		 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
-		 * (aka Interrupt Window Exiting) only when L1 turned it on,
-		 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
-		 * Same for NMI Window Exiting.
-		 */
-		return 1;
+		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
 	case EXIT_REASON_TASK_SWITCH:
 		return 1;
 	case EXIT_REASON_CPUID:
-- 
cgit v1.1


From 5f3d5799974b89100268ba813cec8db7bd0693fb Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 12:12:46 +0200
Subject: KVM: nVMX: Rework event injection and recovery

The basic idea is to always transfer the pending event injection on
vmexit into the architectural state of the VCPU and then drop it from
there if it turns out that we left L2 to enter L1, i.e. if we enter
prepare_vmcs12.

vmcs12_save_pending_events takes care to transfer pending L0 events into
the queue of L1. That is mandatory as L1 may decide to switch the guest
state completely, invalidating or preserving the pending events for
later injection (including on a different node, once we support
migration).

This concept is based on the rule that a pending vmlaunch/vmresume is
not canceled. Otherwise, we would risk to lose injected events or leak
them into the wrong queues. Encode this rule via a WARN_ON_ONCE at the
entry of nested_vmx_vmexit.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 99 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 64 insertions(+), 35 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5ae969b..4fb72a7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6493,8 +6493,6 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-	if (is_guest_mode(&vmx->vcpu))
-		return;
 	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
 				  VM_EXIT_INSTRUCTION_LEN,
 				  IDT_VECTORING_ERROR_CODE);
@@ -6502,8 +6500,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu))
-		return;
 	__vmx_complete_interrupts(vcpu,
 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 				  VM_ENTRY_INSTRUCTION_LEN,
@@ -6535,21 +6531,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long debugctlmsr;
 
-	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		if (vmcs12->idt_vectoring_info_field &
-				VECTORING_INFO_VALID_MASK) {
-			vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-				vmcs12->idt_vectoring_info_field);
-			vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-				vmcs12->vm_exit_instruction_len);
-			if (vmcs12->idt_vectoring_info_field &
-					VECTORING_INFO_DELIVER_CODE_MASK)
-				vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-					vmcs12->idt_vectoring_error_code);
-		}
-	}
-
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
 		vmx->entry_time = ktime_get();
@@ -6708,17 +6689,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
-	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
-		if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-			vmcs12->idt_vectoring_error_code =
-				vmcs_read32(IDT_VECTORING_ERROR_CODE);
-			vmcs12->vm_exit_instruction_len =
-				vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-		}
-	}
-
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -7326,6 +7296,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 			vcpu->arch.cr4_guest_owned_bits));
 }
 
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	u32 idt_vectoring;
+	unsigned int nr;
+
+	if (vcpu->arch.exception.pending) {
+		nr = vcpu->arch.exception.nr;
+		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+		if (kvm_exception_is_soft(nr)) {
+			vmcs12->vm_exit_instruction_len =
+				vcpu->arch.event_exit_inst_len;
+			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+		} else
+			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+		if (vcpu->arch.exception.has_error_code) {
+			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+			vmcs12->idt_vectoring_error_code =
+				vcpu->arch.exception.error_code;
+		}
+
+		vmcs12->idt_vectoring_info_field = idt_vectoring;
+	} else if (vcpu->arch.nmi_pending) {
+		vmcs12->idt_vectoring_info_field =
+			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+	} else if (vcpu->arch.interrupt.pending) {
+		nr = vcpu->arch.interrupt.nr;
+		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+		if (vcpu->arch.interrupt.soft) {
+			idt_vectoring |= INTR_TYPE_SOFT_INTR;
+			vmcs12->vm_entry_instruction_len =
+				vcpu->arch.event_exit_inst_len;
+		} else
+			idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+		vmcs12->idt_vectoring_info_field = idt_vectoring;
+	}
+}
+
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -7411,15 +7423,29 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-	vmcs12->idt_vectoring_info_field = to_vmx(vcpu)->idt_vectoring_info;
-	vmcs12->idt_vectoring_error_code =
-		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field = 0;
 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 
-	/* clear vm-entry fields which are to be cleared on exit */
-	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+		/* vm_entry_intr_info_field is cleared on exit. Emulate this
+		 * instead of reading the real value. */
 		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+		/*
+		 * Transfer the event that L0 or L1 may wanted to inject into
+		 * L2 to IDT_VECTORING_INFO_FIELD.
+		 */
+		vmcs12_save_pending_event(vcpu, vmcs12);
+	}
+
+	/*
+	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
+	 * preserved above and would only end up incorrectly in L1.
+	 */
+	vcpu->arch.nmi_injected = false;
+	kvm_clear_exception_queue(vcpu);
+	kvm_clear_interrupt_queue(vcpu);
 }
 
 /*
@@ -7519,6 +7545,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 	int cpu;
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
+	/* trying to cancel vmlaunch/vmresume is a bug */
+	WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
 	leave_guest_mode(vcpu);
 	prepare_vmcs12(vcpu, vmcs12);
 
-- 
cgit v1.1


From e8457c67a4ec1268ec616bd8be1d9f1cc20f1493 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 12:12:48 +0200
Subject: KVM: nVMX: Fix conditions for interrupt injection

If we are entering guest mode, we do not want L0 to interrupt this
vmentry with all its side effects on the vmcs. Therefore, injection
shall be disallowed during L1->L2 transitions, as in the previous
version. However, this check is conceptually independent of
nested_exit_on_intr, so decouple it.

If L1 traps external interrupts, we can kick the guest from L2 to L1,
also just like the previous code worked. But we no longer need to
consider L1's idt_vectoring_info_field. It will always be empty at this
point. Instead, if L2 has pending events, those are now found in the
architectural queues and will, thus, prevent vmx_interrupt_allowed from
being called at all.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4fb72a7..5e63911 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4325,16 +4325,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+	if (is_guest_mode(vcpu)) {
 		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		if (to_vmx(vcpu)->nested.nested_run_pending ||
-		    (vmcs12->idt_vectoring_info_field &
-		     VECTORING_INFO_VALID_MASK))
+
+		if (to_vmx(vcpu)->nested.nested_run_pending)
 			return 0;
-		nested_vmx_vmexit(vcpu);
-		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
-		vmcs12->vm_exit_intr_info = 0;
-		/* fall through to normal code, but now in L1, not L2 */
+		if (nested_exit_on_intr(vcpu)) {
+			nested_vmx_vmexit(vcpu);
+			vmcs12->vm_exit_reason =
+				EXIT_REASON_EXTERNAL_INTERRUPT;
+			vmcs12->vm_exit_intr_info = 0;
+			/*
+			 * fall through to normal code, but now in L1, not L2
+			 */
+		}
 	}
 
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-- 
cgit v1.1


From c0d1c770c05ac7051df86914f9627b68f29c1d67 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 12:12:50 +0200
Subject: KVM: nVMX: Avoid reading VM_EXIT_INTR_ERROR_CODE needlessly on nested
 exits

We only need to update vm_exit_intr_error_code if there is a valid exit
interruption information and it comes with a valid error code.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5e63911..7175557 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7426,7 +7426,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	if ((vmcs12->vm_exit_intr_info &
+	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
+		vmcs12->vm_exit_intr_error_code =
+			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	vmcs12->idt_vectoring_info_field = 0;
 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-- 
cgit v1.1


From 1fcc7890dbf32571c25278803ee19182c801c006 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:21:35 +0800
Subject: KVM: Add vcpu info to ioapic_update_eoi()

Add vcpu info to ioapic_update_eoi, so we can know which vcpu
issued this EOI.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e227474..3e22536 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -739,7 +739,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 			trigger_mode = IOAPIC_LEVEL_TRIG;
 		else
 			trigger_mode = IOAPIC_EDGE_TRIG;
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+		kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
 	}
 }
 
-- 
cgit v1.1


From b4f2225c07dd4d8eef7aa7f5b36a3b72c3cbbe5b Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:21:37 +0800
Subject: KVM: Return destination vcpu on interrupt injection

Add a new parameter to know vcpus who received the interrupt.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 25 ++++++++++++++++---------
 arch/x86/kvm/lapic.h |  5 +++--
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3e22536..0b73402 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -384,14 +384,16 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-			     int vector, int level, int trig_mode);
+			     int vector, int level, int trig_mode,
+			     unsigned long *dest_map);
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+		unsigned long *dest_map)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
-			irq->level, irq->trig_mode);
+			irq->level, irq->trig_mode, dest_map);
 }
 
 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
@@ -564,7 +566,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 }
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, int *r)
+		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
 {
 	struct kvm_apic_map *map;
 	unsigned long bitmap = 1;
@@ -575,7 +577,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	*r = -1;
 
 	if (irq->shorthand == APIC_DEST_SELF) {
-		*r = kvm_apic_set_irq(src->vcpu, irq);
+		*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
 		return true;
 	}
 
@@ -620,7 +622,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 			continue;
 		if (*r < 0)
 			*r = 0;
-		*r += kvm_apic_set_irq(dst[i]->vcpu, irq);
+		*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
 	}
 
 	ret = true;
@@ -634,7 +636,8 @@ out:
  * Return 1 if successfully added and 0 if discarded.
  */
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-			     int vector, int level, int trig_mode)
+			     int vector, int level, int trig_mode,
+			     unsigned long *dest_map)
 {
 	int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
@@ -647,6 +650,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (unlikely(!apic_enabled(apic)))
 			break;
 
+		if (dest_map)
+			__set_bit(vcpu->vcpu_id, dest_map);
+
 		if (trig_mode) {
 			apic_debug("level trig mode for vector %d", vector);
 			apic_set_vector(vector, apic->regs + APIC_TMR);
@@ -805,7 +811,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
 		   irq.vector);
 
-	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -1441,7 +1447,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 		vector = reg & APIC_VECTOR_MASK;
 		mode = reg & APIC_MODE_MASK;
 		trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
-		return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+		return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
+					NULL);
 	}
 	return 0;
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index baa20cf..3e5a431 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -55,11 +55,12 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+		unsigned long *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, int *r);
+		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-- 
cgit v1.1


From 106069193ce501ea68649037bde7ea50ed492948 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:21:38 +0800
Subject: KVM: Add reset/restore rtc_status support

restore rtc_status from migration or save/restore

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 9 +++++++++
 arch/x86/kvm/lapic.h | 2 ++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0b73402..6796218 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap)
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+		apic_test_vector(vector, apic->regs + APIC_IRR);
+}
+
 static inline void apic_set_vector(int vec, void *bitmap)
 {
 	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -1618,6 +1626,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	apic->highest_isr_cache = -1;
 	kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 3e5a431..16304b1 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -166,4 +166,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 	return vcpu->arch.apic->pending_events;
 }
 
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
 #endif
-- 
cgit v1.1


From aa2fbe6d44892070d78995f0df875ce930904e29 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:21:40 +0800
Subject: KVM: Let ioapic know the irq line status

Userspace may deliver RTC interrupt without query the status. So we
want to track RTC EOI for this case.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8254.c | 4 ++--
 arch/x86/kvm/x86.c   | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c1d30b2..412a5aa 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -290,8 +290,8 @@ static void pit_do_work(struct kthread_work *work)
 	}
 	spin_unlock(&ps->inject_lock);
 	if (inject) {
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
 
 		/*
 		 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 999d124..ae9744d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3491,13 +3491,15 @@ out:
 	return r;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+			bool line_status)
 {
 	if (!irqchip_in_kernel(kvm))
 		return -ENXIO;
 
 	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-					irq_event->irq, irq_event->level);
+					irq_event->irq, irq_event->level,
+					line_status);
 	return 0;
 }
 
-- 
cgit v1.1


From a547c6db4d2f16ba5ce8e7054bffad6acc248d40 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:25:10 +0800
Subject: KVM: VMX: Enable acknowledge interupt on vmexit

The "acknowledge interrupt on exit" feature controls processor behavior
for external interrupt acknowledgement. When this control is set, the
processor acknowledges the interrupt controller to acquire the
interrupt vector on VM exit.

After enabling this feature, an interrupt which arrived when target cpu is
running in vmx non-root mode will be handled by vmx handler instead of handler
in idt. Currently, vmx handler only fakes an interrupt stack and jump to idt
table to let real handler to handle it. Further, we will recognize the interrupt
and only delivery the interrupt which not belong to current vcpu through idt table.
The interrupt which belonged to current vcpu will be handled inside vmx handler.
This will reduce the interrupt handle cost of KVM.

Also, interrupt enable logic is changed if this feature is turnning on:
Before this patch, hypervior call local_irq_enable() to enable it directly.
Now IF bit is set on interrupt stack frame, and will be enabled on a return from
interrupt handler if exterrupt interrupt exists. If no external interrupt, still
call local_irq_enable() to enable it.

Refer to Intel SDM volum 3, chapter 33.2.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c |  6 ++++++
 arch/x86/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/x86.c |  4 +++-
 3 files changed, 63 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7a46c1f..2f8fe3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4233,6 +4233,11 @@ out:
 	return ret;
 }
 
+static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+	local_irq_enable();
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -4328,6 +4333,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_tdp_cr3 = set_tdp_cr3,
 
 	.check_intercept = svm_check_intercept,
+	.handle_external_intr = svm_handle_external_intr,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7175557..7a7605f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -378,6 +378,7 @@ struct vcpu_vmx {
 	struct shared_msr_entry *guest_msrs;
 	int                   nmsrs;
 	int                   save_nmsrs;
+	unsigned long	      host_idt_base;
 #ifdef CONFIG_X86_64
 	u64 		      msr_host_kernel_gs_base;
 	u64 		      msr_guest_kernel_gs_base;
@@ -2627,7 +2628,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
+	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
+		VM_EXIT_ACK_INTR_ON_EXIT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
@@ -3879,7 +3881,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
  * Note that host-state that does change is set elsewhere. E.g., host-state
  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
  */
-static void vmx_set_constant_host_state(void)
+static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 {
 	u32 low32, high32;
 	unsigned long tmpl;
@@ -3907,6 +3909,7 @@ static void vmx_set_constant_host_state(void)
 
 	native_store_idt(&dt);
 	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
+	vmx->host_idt_base = dt.address;
 
 	vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
 
@@ -4039,7 +4042,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
 	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-	vmx_set_constant_host_state();
+	vmx_set_constant_host_state(vmx);
 #ifdef CONFIG_X86_64
 	rdmsrl(MSR_FS_BASE, a);
 	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -6399,6 +6402,52 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 	}
 }
 
+static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/*
+	 * If external interrupt exists, IF bit is set in rflags/eflags on the
+	 * interrupt stack frame, and interrupt will be enabled on a return
+	 * from interrupt handler.
+	 */
+	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
+			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
+		unsigned int vector;
+		unsigned long entry;
+		gate_desc *desc;
+		struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+		unsigned long tmp;
+#endif
+
+		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
+		desc = (gate_desc *)vmx->host_idt_base + vector;
+		entry = gate_offset(*desc);
+		asm volatile(
+#ifdef CONFIG_X86_64
+			"mov %%" _ASM_SP ", %[sp]\n\t"
+			"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
+			"push $%c[ss]\n\t"
+			"push %[sp]\n\t"
+#endif
+			"pushf\n\t"
+			"orl $0x200, (%%" _ASM_SP ")\n\t"
+			__ASM_SIZE(push) " $%c[cs]\n\t"
+			"call *%[entry]\n\t"
+			:
+#ifdef CONFIG_X86_64
+			[sp]"=&r"(tmp)
+#endif
+			:
+			[entry]"r"(entry),
+			[ss]"i"(__KERNEL_DS),
+			[cs]"i"(__KERNEL_CS)
+			);
+	} else
+		local_irq_enable();
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -7041,7 +7090,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * Other fields are different per CPU, and will be set later when
 	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
 	 */
-	vmx_set_constant_host_state();
+	vmx_set_constant_host_state(vmx);
 
 	/*
 	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -7718,6 +7767,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_tdp_cr3 = vmx_set_cr3,
 
 	.check_intercept = vmx_check_intercept,
+	.handle_external_intr = vmx_handle_external_intr,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae9744d..1562671 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5820,7 +5820,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
-	local_irq_enable();
+
+	/* Interrupt is enabled by handle_external_intr() */
+	kvm_x86_ops->handle_external_intr(vcpu);
 
 	++vcpu->stat.exits;
 
-- 
cgit v1.1


From 01e439be7753c163932538276f04f95cb1b66697 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:25:12 +0800
Subject: KVM: VMX: Check the posted interrupt capability

Detect the posted interrupt feature. If it exists, then set it in vmcs_config.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 82 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 20 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7a7605f..7607c6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,7 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv_reg_vid;
+static bool __read_mostly enable_apicv;
+module_param(enable_apicv, bool, S_IRUGO);
 
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
@@ -366,6 +367,14 @@ struct nested_vmx {
 	struct page *apic_access_page;
 };
 
+#define POSTED_INTR_ON  0
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+	u32 pir[8];     /* Posted interrupt requested */
+	u32 control;	/* bit 0 of control is outstanding notification bit */
+	u32 rsvd[7];
+} __aligned(64);
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
@@ -430,6 +439,9 @@ struct vcpu_vmx {
 
 	bool rdtscp_enabled;
 
+	/* Posted interrupt descriptor */
+	struct pi_desc pi_desc;
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -785,6 +797,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+	return cpu_has_vmx_apic_register_virt() &&
+		cpu_has_vmx_virtual_intr_delivery() &&
+		cpu_has_vmx_posted_intr();
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2552,12 +2576,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	u32 _vmexit_control = 0;
 	u32 _vmentry_control = 0;
 
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
 	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
@@ -2634,6 +2652,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_vmexit_control) < 0)
 		return -EIO;
 
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+				&_pin_based_exec_control) < 0)
+		return -EIO;
+
+	if (!(_cpu_based_2nd_exec_control &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
+		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
 	min = 0;
 	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2812,11 +2841,10 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;
 
-	if (!cpu_has_vmx_apic_register_virt() ||
-				!cpu_has_vmx_virtual_intr_delivery())
-		enable_apicv_reg_vid = 0;
+	if (!cpu_has_vmx_apicv())
+		enable_apicv = 0;
 
-	if (enable_apicv_reg_vid)
+	if (enable_apicv)
 		kvm_x86_ops->update_cr8_intercept = NULL;
 	else
 		kvm_x86_ops->hwapic_irr_update = NULL;
@@ -3875,6 +3903,11 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 			msr, MSR_TYPE_W);
 }
 
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+	return enable_apicv && irqchip_in_kernel(kvm);
+}
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -3935,6 +3968,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+	return pin_based_exec_ctrl;
+}
+
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -3952,11 +3994,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	return exec_control;
 }
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
-{
-	return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
-}
-
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -4012,8 +4049,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
 	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		vmcs_config.pin_based_exec_ctrl);
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -4022,13 +4058,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				vmx_secondary_exec_control(vmx));
 	}
 
-	if (enable_apicv_reg_vid) {
+	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 		vmcs_write64(EOI_EXIT_BITMAP2, 0);
 		vmcs_write64(EOI_EXIT_BITMAP3, 0);
 
 		vmcs_write16(GUEST_INTR_STATUS, 0);
+
+		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
 	}
 
 	if (ple_gap) {
@@ -4170,6 +4209,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write64(APIC_ACCESS_ADDR,
 			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
 
+	if (vmx_vm_has_apicv(vcpu->kvm))
+		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
+
 	if (vmx->vpid != 0)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
@@ -7842,7 +7884,7 @@ static int __init vmx_init(void)
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
 
-	if (enable_apicv_reg_vid) {
+	if (enable_apicv) {
 		for (msr = 0x800; msr <= 0x8ff; msr++)
 			vmx_disable_intercept_msr_read_x2apic(msr);
 
-- 
cgit v1.1


From 3d81bc7e96d6bca0b8f8b7d1bf6ea72caa3aac57 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:25:13 +0800
Subject: KVM: Call common update function when ioapic entry changed.

Both TMR and EOI exit bitmap need to be updated when ioapic changed
or vcpu's id/ldr/dfr changed. So use common function instead eoi exit
bitmap specific function.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c |  2 +-
 arch/x86/kvm/vmx.c   |  3 +++
 arch/x86/kvm/x86.c   | 11 +++++++----
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6796218..34a8ca8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -217,7 +217,7 @@ out:
 	if (old)
 		kfree_rcu(old, rcu);
 
-	kvm_ioapic_make_eoibitmap_request(kvm);
+	kvm_vcpu_request_scan_ioapic(kvm);
 }
 
 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7607c6c..8535519 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6414,6 +6414,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
+	if (!vmx_vm_has_apicv(vcpu->kvm))
+		return;
+
 	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
 	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
 	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1562671..87a05df 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5661,13 +5661,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	u64 eoi_exit_bitmap[4];
 
+	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+		return;
+
 	memset(eoi_exit_bitmap, 0, 32);
 
-	kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap);
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
@@ -5724,8 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_handle_pmu_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 			kvm_deliver_pmi(vcpu);
-		if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
-			update_eoi_exitmap(vcpu);
+		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+			vcpu_scan_ioapic(vcpu);
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
-- 
cgit v1.1


From cf9e65b773394c5ad8fa7455c43268bc8ec2109f Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:25:14 +0800
Subject: KVM: Set TMR when programming ioapic entry

We already know the trigger mode of a given interrupt when programming
the ioapice entry. So it's not necessary to set it in each interrupt
delivery.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 15 +++++++++------
 arch/x86/kvm/lapic.h |  1 +
 arch/x86/kvm/x86.c   |  5 ++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 34a8ca8..d197579 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -468,6 +468,15 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 	return result;
 }
 
+void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
+}
+
 static void apic_update_ppr(struct kvm_lapic *apic)
 {
 	u32 tpr, isrv, ppr, old_ppr;
@@ -661,12 +670,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (dest_map)
 			__set_bit(vcpu->vcpu_id, dest_map);
 
-		if (trig_mode) {
-			apic_debug("level trig mode for vector %d", vector);
-			apic_set_vector(vector, apic->regs + APIC_TMR);
-		} else
-			apic_clear_vector(vector, apic->regs + APIC_TMR);
-
 		result = !apic_test_and_set_irr(vector, apic);
 		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
 					  trig_mode, vector, !result);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 16304b1..7fe0c91 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -53,6 +53,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
+void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87a05df..276b4a9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5664,14 +5664,17 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	u64 eoi_exit_bitmap[4];
+	u32 tmr[8];
 
 	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
 		return;
 
 	memset(eoi_exit_bitmap, 0, 32);
+	memset(tmr, 0, 32);
 
-	kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap);
+	kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_apic_update_tmr(vcpu, tmr);
 }
 
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
-- 
cgit v1.1


From a20ed54d6e470bf0d28921b7aadb6ca0da0ff0c3 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:25:15 +0800
Subject: KVM: VMX: Add the deliver posted interrupt algorithm

Only deliver the posted interrupt when target vcpu is running
and there is no previous interrupt pending in pir.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 13 +++++++++++
 arch/x86/kvm/lapic.h |  1 +
 arch/x86/kvm/svm.c   |  6 +++++
 arch/x86/kvm/vmx.c   | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 83 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d197579..dbf74c9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -318,6 +318,19 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+	u32 i, pir_val;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	for (i = 0; i <= 7; i++) {
+		pir_val = xchg(&pir[i], 0);
+		if (pir_val)
+			*((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
 {
 	apic->irr_pending = true;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 7fe0c91..c730ac9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -54,6 +54,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f8fe3f..d6713e1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3577,6 +3577,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
 	return;
 }
 
+static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4305,6 +4310,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.vm_has_apicv = svm_vm_has_apicv,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.hwapic_isr_update = svm_hwapic_isr_update,
+	.sync_pir_to_irr = svm_sync_pir_to_irr,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8535519..3a14d8a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -375,6 +375,23 @@ struct pi_desc {
 	u32 rsvd[7];
 } __aligned(64);
 
+static bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+	return test_and_set_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->control);
+}
+
+static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+	return test_and_clear_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->control);
+}
+
+static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
@@ -639,6 +656,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
+static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2846,8 +2864,11 @@ static __init int hardware_setup(void)
 
 	if (enable_apicv)
 		kvm_x86_ops->update_cr8_intercept = NULL;
-	else
+	else {
 		kvm_x86_ops->hwapic_irr_update = NULL;
+		kvm_x86_ops->deliver_posted_interrupt = NULL;
+		kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+	}
 
 	if (nested)
 		nested_vmx_setup_ctls_msrs();
@@ -3909,6 +3930,45 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
 }
 
 /*
+ * Send interrupt to vcpu via posted interrupt way.
+ * 1. If target vcpu is running(non-root mode), send posted interrupt
+ * notification to vcpu and hardware will sync PIR to vIRR atomically.
+ * 2. If target vcpu isn't running(root mode), kick it to pick up the
+ * interrupt from PIR in next vmentry.
+ */
+static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r;
+
+	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+		return;
+
+	r = pi_test_and_set_on(&vmx->pi_desc);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	if (!r && (vcpu->mode == IN_GUEST_MODE))
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+				POSTED_INTR_VECTOR);
+	else
+		kvm_vcpu_kick(vcpu);
+}
+
+static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!pi_test_and_clear_on(&vmx->pi_desc))
+		return;
+
+	kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+}
+
+static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
+/*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
  * Note that host-state that does change is set elsewhere. E.g., host-state
@@ -7784,6 +7844,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
+	.sync_pir_to_irr = vmx_sync_pir_to_irr,
+	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
-- 
cgit v1.1


From 5a71785dde307f6ac80e83c0ad3fd694912010a1 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.z.zhang@Intel.com>
Date: Thu, 11 Apr 2013 19:25:16 +0800
Subject: KVM: VMX: Use posted interrupt to deliver virtual interrupt

If posted interrupt is avaliable, then uses it to inject virtual
interrupt to guest.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 30 +++++++++++++++++++-----------
 arch/x86/kvm/vmx.c   |  2 +-
 arch/x86/kvm/x86.c   |  1 +
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dbf74c9..e29883c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -353,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 	if (!apic->irr_pending)
 		return -1;
 
+	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
 	result = apic_search_irr(apic);
 	ASSERT(result == -1 || result >= 16);
 
@@ -683,18 +684,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (dest_map)
 			__set_bit(vcpu->vcpu_id, dest_map);
 
-		result = !apic_test_and_set_irr(vector, apic);
-		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
-					  trig_mode, vector, !result);
-		if (!result) {
-			if (trig_mode)
-				apic_debug("level trig mode repeatedly for "
-						"vector %d", vector);
-			break;
-		}
+		if (kvm_x86_ops->deliver_posted_interrupt) {
+			result = 1;
+			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
+		} else {
+			result = !apic_test_and_set_irr(vector, apic);
 
-		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		kvm_vcpu_kick(vcpu);
+			if (!result) {
+				if (trig_mode)
+					apic_debug("level trig mode repeatedly "
+						"for vector %d", vector);
+				goto out;
+			}
+
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
+out:
+		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+				trig_mode, vector, !result);
 		break;
 
 	case APIC_DM_REMRD:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3a14d8a..5a87a58 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,7 +84,7 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv;
+static bool __read_mostly enable_apicv = 1;
 module_param(enable_apicv, bool, S_IRUGO);
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 276b4a9..50e2e10 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2692,6 +2692,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
+	kvm_x86_ops->sync_pir_to_irr(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
 
 	return 0;
-- 
cgit v1.1


From 26539bd0e446a54665f9d6f4c69a21140b2e1d85 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 15 Apr 2013 15:00:27 +0200
Subject: KVM: nVMX: check vmcs12 for valid activity state

KVM does not use the activity state VMCS field, and does not support
it in nested VMX either (the corresponding bits in the misc VMX feature
MSR are zero).  Fail entry if the activity state is set to anything but
"active".

Since the value will always be the same for L1 and L2, we do not need
to read and write the corresponding VMCS field on L1/L2 transitions,
either.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Gleb Natapov <gleb@redhat.com>
Reviewed-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5a87a58..8d52bcf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7106,7 +7106,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs12->vm_entry_instruction_len);
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
 		vmcs12->guest_interruptibility_info);
-	vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 	kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
 	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
@@ -7325,6 +7324,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 		return 1;
 	}
 
+	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
 	if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
 			!IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
 		/*TODO: Also verify bits beyond physical address width are 0*/
@@ -7555,7 +7559,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
 	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
 
-	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
 	vmcs12->guest_interruptibility_info =
 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 	vmcs12->guest_pending_dbg_exceptions =
-- 
cgit v1.1


From f13882d84df31a8567032b6bcbfbdd76ac378513 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 14 Apr 2013 16:07:37 +0300
Subject: KVM: VMX: Fix check guest state validity if a guest is in VM86 mode

If guest vcpu is in VM86 mode the vcpu state should be checked as if in
real mode.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8d52bcf..c84f0cb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3605,7 +3605,7 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
 		return true;
 
 	/* real mode guest state checks */
-	if (!is_protmode(vcpu)) {
+	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
 		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
 			return false;
 		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
-- 
cgit v1.1


From 6ffbbbbab34ef84b89f8b9f50a5f0fed5795d79a Mon Sep 17 00:00:00 2001
From: "Zhang, Yang Z" <yang.z.zhang@intel.com>
Date: Wed, 17 Apr 2013 23:11:54 -0300
Subject: KVM: x86: Fix posted interrupt with CONFIG_SMP=n

->send_IPI_mask is not defined on UP.

Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c84f0cb..a05cca6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3946,10 +3946,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 
 	r = pi_test_and_set_on(&vmx->pi_desc);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
+#ifdef CONFIG_SMP
 	if (!r && (vcpu->mode == IN_GUEST_MODE))
 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
 				POSTED_INTR_VECTOR);
 	else
+#endif
 		kvm_vcpu_kick(vcpu);
 }
 
-- 
cgit v1.1


From abc4fc58c5ba1a794092bcd97fdb1680b0b3398d Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:35:25 +0300
Subject: KVM: nVMX: Detect shadow-vmcs capability

Add logic required to detect if shadow-vmcs is supported by the
processor. Introduce a new kernel module parameter to specify if L0 should use
shadow vmcs (or not) to run L1.

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a05cca6..7042b69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -87,6 +87,8 @@ module_param(fasteoi, bool, S_IRUGO);
 static bool __read_mostly enable_apicv = 1;
 module_param(enable_apicv, bool, S_IRUGO);
 
+static bool __read_mostly enable_shadow_vmcs = 1;
+module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -940,6 +942,18 @@ static inline bool cpu_has_vmx_wbinvd_exit(void)
 		SECONDARY_EXEC_WBINVD_EXITING;
 }
 
+static inline bool cpu_has_vmx_shadow_vmcs(void)
+{
+	u64 vmx_msr;
+	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+	/* check if the cpu supports writing r/o exit information fields */
+	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+		return false;
+
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_SHADOW_VMCS;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -2632,7 +2646,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_RDTSCP |
 			SECONDARY_EXEC_ENABLE_INVPCID |
 			SECONDARY_EXEC_APIC_REGISTER_VIRT |
-			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+			SECONDARY_EXEC_SHADOW_VMCS;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -2833,6 +2848,8 @@ static __init int hardware_setup(void)
 
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
+	if (!cpu_has_vmx_shadow_vmcs())
+		enable_shadow_vmcs = 0;
 
 	if (!cpu_has_vmx_ept() ||
 	    !cpu_has_vmx_ept_4levels()) {
@@ -4077,6 +4094,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
+	   (handle_vmptrld).
+	   We can NOT enable shadow_vmcs here because we don't have yet
+	   a current VMCS12
+	*/
+	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 	return exec_control;
 }
 
-- 
cgit v1.1


From 4607c2d7a2ee90707de2b3b37e4d9156e05cdf29 Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:35:55 +0300
Subject: KVM: nVMX: Introduce vmread and vmwrite bitmaps

Prepare vmread and vmwrite bitmaps according to a pre-specified list of fields.
These lists are intended to specifiy most frequent accessed fields so we can
minimize the number of fields that are copied from/to the software controlled
VMCS12 format to/from to processor-specific shadow vmcs. The lists were built
measuring the VMCS fields access rate after L2 Ubuntu 12.04 booted when it was
running on top of L1 KVM, also Ubuntu 12.04. Note that during boot there were
additional fields which were frequently modified but they were not added to
these lists because after boot these fields were not longer accessed by L1.

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7042b69..7dc5996 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -484,6 +484,64 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 #define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
 				[number##_HIGH] = VMCS12_OFFSET(name)+4
 
+
+static const unsigned long shadow_read_only_fields[] = {
+	/*
+	 * We do NOT shadow fields that are modified when L0
+	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
+	 * VMXON...) executed by L1.
+	 * For example, VM_INSTRUCTION_ERROR is read
+	 * by L1 if a vmx instruction fails (part of the error path).
+	 * Note the code assumes this logic. If for some reason
+	 * we start shadowing these fields then we need to
+	 * force a shadow sync when L0 emulates vmx instructions
+	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+	 * by nested_vmx_failValid)
+	 */
+	VM_EXIT_REASON,
+	VM_EXIT_INTR_INFO,
+	VM_EXIT_INSTRUCTION_LEN,
+	IDT_VECTORING_INFO_FIELD,
+	IDT_VECTORING_ERROR_CODE,
+	VM_EXIT_INTR_ERROR_CODE,
+	EXIT_QUALIFICATION,
+	GUEST_LINEAR_ADDRESS,
+	GUEST_PHYSICAL_ADDRESS
+};
+static const int max_shadow_read_only_fields =
+	ARRAY_SIZE(shadow_read_only_fields);
+
+static const unsigned long shadow_read_write_fields[] = {
+	GUEST_RIP,
+	GUEST_RSP,
+	GUEST_CR0,
+	GUEST_CR3,
+	GUEST_CR4,
+	GUEST_INTERRUPTIBILITY_INFO,
+	GUEST_RFLAGS,
+	GUEST_CS_SELECTOR,
+	GUEST_CS_AR_BYTES,
+	GUEST_CS_LIMIT,
+	GUEST_CS_BASE,
+	GUEST_ES_BASE,
+	CR0_GUEST_HOST_MASK,
+	CR0_READ_SHADOW,
+	CR4_READ_SHADOW,
+	TSC_OFFSET,
+	EXCEPTION_BITMAP,
+	CPU_BASED_VM_EXEC_CONTROL,
+	VM_ENTRY_EXCEPTION_ERROR_CODE,
+	VM_ENTRY_INTR_INFO_FIELD,
+	VM_ENTRY_INSTRUCTION_LEN,
+	VM_ENTRY_EXCEPTION_ERROR_CODE,
+	HOST_FS_BASE,
+	HOST_GS_BASE,
+	HOST_FS_SELECTOR,
+	HOST_GS_SELECTOR
+};
+static const int max_shadow_read_write_fields =
+	ARRAY_SIZE(shadow_read_write_fields);
+
 static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
 	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -675,6 +733,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_vmread_bitmap;
+static unsigned long *vmx_vmwrite_bitmap;
 
 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -4128,6 +4188,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
 	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
+	if (enable_shadow_vmcs) {
+		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
+		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+	}
 	if (cpu_has_vmx_msr_bitmap())
 		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
 
@@ -7941,6 +8005,24 @@ static int __init vmx_init(void)
 				(unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode_x2apic)
 		goto out4;
+	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_vmread_bitmap)
+		goto out5;
+
+	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_vmwrite_bitmap)
+		goto out6;
+
+	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+	/* shadowed read/write fields */
+	for (i = 0; i < max_shadow_read_write_fields; i++) {
+		clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap);
+		clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap);
+	}
+	/* shadowed read only fields */
+	for (i = 0; i < max_shadow_read_only_fields; i++)
+		clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap);
 
 	/*
 	 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7959,7 +8041,7 @@ static int __init vmx_init(void)
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
-		goto out5;
+		goto out7;
 
 #ifdef CONFIG_KEXEC
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
@@ -8007,6 +8089,10 @@ static int __init vmx_init(void)
 
 	return 0;
 
+out7:
+	free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+	free_page((unsigned long)vmx_vmread_bitmap);
 out5:
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
@@ -8030,6 +8116,8 @@ static void __exit vmx_exit(void)
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);
+	free_page((unsigned long)vmx_vmwrite_bitmap);
+	free_page((unsigned long)vmx_vmread_bitmap);
 
 #ifdef CONFIG_KEXEC
 	rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
-- 
cgit v1.1


From 20b97feaf615da2c43361e1dda7efd068a4446ef Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:36:25 +0300
Subject: KVM: nVMX: Refactor handle_vmwrite

Refactor existent code so we re-use vmcs12_write_any to copy fields from the
shadow vmcs specified by the link pointer (used by the processor,
implementation-specific) to the VMCS12 software format used by L0 to hold
the fields in L1 memory address space.

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 52 ++++++++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7dc5996..1ee63ca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5842,6 +5842,33 @@ static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
 	}
 }
 
+
+static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
+				    unsigned long field, u64 field_value){
+	short offset = vmcs_field_to_offset(field);
+	char *p = ((char *) get_vmcs12(vcpu)) + offset;
+	if (offset < 0)
+		return false;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_U16:
+		*(u16 *)p = field_value;
+		return true;
+	case VMCS_FIELD_TYPE_U32:
+		*(u32 *)p = field_value;
+		return true;
+	case VMCS_FIELD_TYPE_U64:
+		*(u64 *)p = field_value;
+		return true;
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		*(natural_width *)p = field_value;
+		return true;
+	default:
+		return false; /* can never happen. */
+	}
+
+}
+
 /*
  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
  * used before) all generate the same failure when it is missing.
@@ -5906,8 +5933,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	gva_t gva;
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-	char *p;
-	short offset;
 	/* The value to write might be 32 or 64 bits, depending on L1's long
 	 * mode, and eventually we need to write that into a field of several
 	 * possible lengths. The code below first zero-extends the value to 64
@@ -5944,28 +5969,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	offset = vmcs_field_to_offset(field);
-	if (offset < 0) {
-		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		skip_emulated_instruction(vcpu);
-		return 1;
-	}
-	p = ((char *) get_vmcs12(vcpu)) + offset;
-
-	switch (vmcs_field_type(field)) {
-	case VMCS_FIELD_TYPE_U16:
-		*(u16 *)p = field_value;
-		break;
-	case VMCS_FIELD_TYPE_U32:
-		*(u32 *)p = field_value;
-		break;
-	case VMCS_FIELD_TYPE_U64:
-		*(u64 *)p = field_value;
-		break;
-	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-		*(natural_width *)p = field_value;
-		break;
-	default:
+	if (!vmcs12_write_any(vcpu, field, field_value)) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		skip_emulated_instruction(vcpu);
 		return 1;
-- 
cgit v1.1


From 145c28dd19a2a05aa798e77bd7e679847fe8bee3 Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:36:55 +0300
Subject: KVM: nVMX: Fix VMXON emulation

handle_vmon doesn't check if L1 is already in root mode (VMXON
was previously called). This patch adds this missing check and calls
nested_vmx_failValid if VMX is already ON.
We need this check because L0 will allocate the shadow vmcs when L1
executes VMXON and we want to avoid host leaks (due to shadow vmcs
allocation) if L1 executes VMXON repeatedly.

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1ee63ca..1cdfb5d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5512,6 +5512,9 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 		free_loaded_vmcs(&vmx->vmcs01);
 }
 
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+				 u32 vm_instruction_error);
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5547,6 +5550,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		kvm_inject_gp(vcpu, 0);
 		return 1;
 	}
+	if (vmx->nested.vmxon) {
+		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
 
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num = 0;
-- 
cgit v1.1


From 8de48833708a7834f1ba65a80944b1045082553a Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:37:25 +0300
Subject: KVM: nVMX: Allocate shadow vmcs

Allocate a shadow vmcs used by the processor to shadow part of the fields
stored in the software defined VMCS12 (let L1 access fields without causing
exits). Note we keep a shadow vmcs only for the current vmcs12.  Once a vmcs12
becomes non-current, its shadow vmcs is released.

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1cdfb5d..7b27af9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -355,6 +355,7 @@ struct nested_vmx {
 	/* The host-usable pointer to the above */
 	struct page *current_vmcs12_page;
 	struct vmcs12 *current_vmcs12;
+	struct vmcs *current_shadow_vmcs;
 
 	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
 	struct list_head vmcs02_pool;
@@ -5527,6 +5528,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
 	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs *shadow_vmcs;
 
 	/* The Intel VMX Instruction Reference lists a bunch of bits that
 	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5555,6 +5557,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
+	if (enable_shadow_vmcs) {
+		shadow_vmcs = alloc_vmcs();
+		if (!shadow_vmcs)
+			return -ENOMEM;
+		/* mark vmcs as shadow */
+		shadow_vmcs->revision_id |= (1u << 31);
+		/* init shadow vmcs */
+		vmcs_clear(shadow_vmcs);
+		vmx->nested.current_shadow_vmcs = shadow_vmcs;
+	}
 
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num = 0;
-- 
cgit v1.1


From e7953d7fab47a72e0d82e5d4dbb022864d8c2a41 Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:37:55 +0300
Subject: KVM: nVMX: Release shadow vmcs

Unmap vmcs12 and release the corresponding shadow vmcs

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7b27af9..ad3957e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5607,6 +5607,12 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+{
+	kunmap(vmx->nested.current_vmcs12_page);
+	nested_release_page(vmx->nested.current_vmcs12_page);
+}
+
 /*
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
@@ -5617,11 +5623,12 @@ static void free_nested(struct vcpu_vmx *vmx)
 		return;
 	vmx->nested.vmxon = false;
 	if (vmx->nested.current_vmptr != -1ull) {
-		kunmap(vmx->nested.current_vmcs12_page);
-		nested_release_page(vmx->nested.current_vmcs12_page);
+		nested_release_vmcs12(vmx);
 		vmx->nested.current_vmptr = -1ull;
 		vmx->nested.current_vmcs12 = NULL;
 	}
+	if (enable_shadow_vmcs)
+		free_vmcs(vmx->nested.current_shadow_vmcs);
 	/* Unpin physical memory we referred to in current vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		nested_release_page(vmx->nested.apic_access_page);
@@ -5762,8 +5769,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	}
 
 	if (vmptr == vmx->nested.current_vmptr) {
-		kunmap(vmx->nested.current_vmcs12_page);
-		nested_release_page(vmx->nested.current_vmcs12_page);
+		nested_release_vmcs12(vmx);
 		vmx->nested.current_vmptr = -1ull;
 		vmx->nested.current_vmcs12 = NULL;
 	}
@@ -6045,10 +6051,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 			skip_emulated_instruction(vcpu);
 			return 1;
 		}
-		if (vmx->nested.current_vmptr != -1ull) {
-			kunmap(vmx->nested.current_vmcs12_page);
-			nested_release_page(vmx->nested.current_vmcs12_page);
-		}
+		if (vmx->nested.current_vmptr != -1ull)
+			nested_release_vmcs12(vmx);
 
 		vmx->nested.current_vmptr = vmptr;
 		vmx->nested.current_vmcs12 = new_vmcs12;
-- 
cgit v1.1


From 16f5b9034b699dea2f8862c324a0fd3275b4a50f Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:38:25 +0300
Subject: KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12

Introduce a function used to copy fields from the processor-specific shadow
vmcs to the software controlled VMCS12

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad3957e..628d012 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -718,6 +718,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -5895,6 +5896,40 @@ static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
 
 }
 
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
+{
+	int i;
+	unsigned long field;
+	u64 field_value;
+	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
+	unsigned long *fields = (unsigned long *)shadow_read_write_fields;
+	int num_fields = max_shadow_read_write_fields;
+
+	vmcs_load(shadow_vmcs);
+
+	for (i = 0; i < num_fields; i++) {
+		field = fields[i];
+		switch (vmcs_field_type(field)) {
+		case VMCS_FIELD_TYPE_U16:
+			field_value = vmcs_read16(field);
+			break;
+		case VMCS_FIELD_TYPE_U32:
+			field_value = vmcs_read32(field);
+			break;
+		case VMCS_FIELD_TYPE_U64:
+			field_value = vmcs_read64(field);
+			break;
+		case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+			field_value = vmcs_readl(field);
+			break;
+		}
+		vmcs12_write_any(&vmx->vcpu, field, field_value);
+	}
+
+	vmcs_clear(shadow_vmcs);
+	vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
 /*
  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
  * used before) all generate the same failure when it is missing.
-- 
cgit v1.1


From c3114420d1c7a6075fb0cfdc69b567423e5cfc13 Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:38:55 +0300
Subject: KVM: nVMX: Copy VMCS12 to processor-specific shadow vmcs

Introduce a function used to copy fields from the software controlled VMCS12
to the processor-specific shadow vmcs

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 628d012..8dc59aa 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -718,6 +718,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -5930,6 +5931,50 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 	vmcs_load(vmx->loaded_vmcs->vmcs);
 }
 
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
+{
+	unsigned long *fields[] = {
+		(unsigned long *)shadow_read_write_fields,
+		(unsigned long *)shadow_read_only_fields
+	};
+	int num_lists =  ARRAY_SIZE(fields);
+	int max_fields[] = {
+		max_shadow_read_write_fields,
+		max_shadow_read_only_fields
+	};
+	int i, q;
+	unsigned long field;
+	u64 field_value = 0;
+	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
+
+	vmcs_load(shadow_vmcs);
+
+	for (q = 0; q < num_lists; q++) {
+		for (i = 0; i < max_fields[q]; i++) {
+			field = fields[q][i];
+			vmcs12_read_any(&vmx->vcpu, field, &field_value);
+
+			switch (vmcs_field_type(field)) {
+			case VMCS_FIELD_TYPE_U16:
+				vmcs_write16(field, (u16)field_value);
+				break;
+			case VMCS_FIELD_TYPE_U32:
+				vmcs_write32(field, (u32)field_value);
+				break;
+			case VMCS_FIELD_TYPE_U64:
+				vmcs_write64(field, (u64)field_value);
+				break;
+			case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+				vmcs_writel(field, (long)field_value);
+				break;
+			}
+		}
+	}
+
+	vmcs_clear(shadow_vmcs);
+	vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
 /*
  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
  * used before) all generate the same failure when it is missing.
-- 
cgit v1.1


From 012f83cb2f8d7b9b7ad3b65e7e53a9365a357014 Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:39:25 +0300
Subject: KVM: nVMX: Synchronize VMCS12 content with the shadow vmcs

Synchronize between the VMCS12 software controlled structure and the
processor-specific shadow vmcs

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8dc59aa..c5baecc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -356,6 +356,11 @@ struct nested_vmx {
 	struct page *current_vmcs12_page;
 	struct vmcs12 *current_vmcs12;
 	struct vmcs *current_shadow_vmcs;
+	/*
+	 * Indicates if the shadow vmcs must be updated with the
+	 * data hold by vmcs12
+	 */
+	bool sync_shadow_vmcs;
 
 	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
 	struct list_head vmcs02_pool;
@@ -5611,6 +5616,14 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
+	if (enable_shadow_vmcs) {
+		if (vmx->nested.current_vmcs12 != NULL) {
+			/* copy to memory all shadowed fields in case
+			   they were modified */
+			copy_shadow_to_vmcs12(vmx);
+			vmx->nested.sync_shadow_vmcs = false;
+		}
+	}
 	kunmap(vmx->nested.current_vmcs12_page);
 	nested_release_page(vmx->nested.current_vmcs12_page);
 }
@@ -5739,6 +5752,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 			| X86_EFLAGS_ZF);
 	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+	/*
+	 * We don't need to force a shadow sync because
+	 * VM_INSTRUCTION_ERROR is not shadowed
+	 */
 }
 
 /* Emulate the VMCLEAR instruction */
@@ -6137,6 +6154,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		vmx->nested.current_vmptr = vmptr;
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12_page = page;
+		if (enable_shadow_vmcs) {
+			vmx->nested.sync_shadow_vmcs = true;
+		}
 	}
 
 	nested_vmx_succeed(vcpu);
@@ -6895,6 +6915,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required)
 		return;
 
+	if (vmx->nested.sync_shadow_vmcs) {
+		copy_vmcs12_to_shadow(vmx);
+		vmx->nested.sync_shadow_vmcs = false;
+	}
+
 	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
 		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
 	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -7504,6 +7529,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	skip_emulated_instruction(vcpu);
 	vmcs12 = get_vmcs12(vcpu);
 
+	if (enable_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+
 	/*
 	 * The nested entry process starts with enforcing various prerequisites
 	 * on vmcs12 as required by the Intel SDM, and act appropriately when
@@ -7950,6 +7978,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 		nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
 	} else
 		nested_vmx_succeed(vcpu);
+	if (enable_shadow_vmcs)
+		vmx->nested.sync_shadow_vmcs = true;
 }
 
 /*
@@ -7967,6 +7997,8 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
 	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
 	vmcs12->exit_qualification = qualification;
 	nested_vmx_succeed(vcpu);
+	if (enable_shadow_vmcs)
+		to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
-- 
cgit v1.1


From 8a1b9dd0006bce5cc770fd80bc95f9916670c151 Mon Sep 17 00:00:00 2001
From: Abel Gordon <abelg@il.ibm.com>
Date: Thu, 18 Apr 2013 14:39:55 +0300
Subject: KVM: nVMX: Enable and disable shadow vmcs functionality

Once L1 loads VMCS12 we enable shadow-vmcs capability and copy all the VMCS12
shadowed fields to the shadow vmcs.  When we release the VMCS12, we also
disable shadow-vmcs capability.

Signed-off-by: Abel Gordon <abelg@il.ibm.com>
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5baecc..6abce2d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5616,12 +5616,17 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
+	u32 exec_control;
 	if (enable_shadow_vmcs) {
 		if (vmx->nested.current_vmcs12 != NULL) {
 			/* copy to memory all shadowed fields in case
 			   they were modified */
 			copy_shadow_to_vmcs12(vmx);
 			vmx->nested.sync_shadow_vmcs = false;
+			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+			exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+			vmcs_write64(VMCS_LINK_POINTER, -1ull);
 		}
 	}
 	kunmap(vmx->nested.current_vmcs12_page);
@@ -6110,6 +6115,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	gva_t gva;
 	gpa_t vmptr;
 	struct x86_exception e;
+	u32 exec_control;
 
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
@@ -6155,6 +6161,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12_page = page;
 		if (enable_shadow_vmcs) {
+			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+			exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
+			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+			vmcs_write64(VMCS_LINK_POINTER,
+				     __pa(vmx->nested.current_shadow_vmcs));
 			vmx->nested.sync_shadow_vmcs = true;
 		}
 	}
-- 
cgit v1.1


From f1797359216c1daa145a354d07b8b2b7459668f4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 18 Apr 2013 07:41:00 +0800
Subject: KVM: x86: fix error return code in kvm_arch_vcpu_init()

Fix to return a negative error code from the error handling
case instead of 0, as returned elsewhere in this function.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 50e2e10..cbe16b4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6752,8 +6752,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+		r = -ENOMEM;
 		goto fail_free_mce_banks;
+	}
 
 	r = fx_init(vcpu);
 	if (r)
-- 
cgit v1.1


From 27469d29b3caf889ddf81c7d89f0676e45eb551d Mon Sep 17 00:00:00 2001
From: Andrew Honig <ahonig@google.com>
Date: Thu, 18 Apr 2013 09:38:14 -0700
Subject: KVM: x86: Fix memory leak in vmx.c

If userspace creates and destroys multiple VMs within the same process
we leak 20k of memory in the userspace process context per VM.  This
patch frees the memory in kvm_arch_destroy_vm.  If the process exits
without closing the VM file descriptor or the file descriptor has been
shared with another process then we don't free the memory.

It's still possible for a user space process to leak memory if the last
process to close the fd for the VM is not the process that created it.
However, this is an unexpected case that's only caused by a user space
process that's misbehaving.

Signed-off-by: Andrew Honig <ahonig@google.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cbe16b4..e730a46 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6859,6 +6859,23 @@ void kvm_arch_sync_events(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	if (current->mm == kvm->mm) {
+		/*
+		 * Free memory regions allocated on behalf of userspace,
+		 * unless the the memory map has changed due to process exit
+		 * or fd copying.
+		 */
+		struct kvm_userspace_memory_region mem;
+		memset(&mem, 0, sizeof(mem));
+		mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+		kvm_set_memory_region(kvm, &mem);
+
+		mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+		kvm_set_memory_region(kvm, &mem);
+
+		mem.slot = TSS_PRIVATE_MEMSLOT;
+		kvm_set_memory_region(kvm, &mem);
+	}
 	kvm_iommu_unmap_guest(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
-- 
cgit v1.1


From 2505dc9fad44491ac4ce06290f358c030abe6be3 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 12:12:47 +0200
Subject: KVM: VMX: Move vmx_nmi_allowed after vmx_set_nmi_mask

vmx_set_nmi_mask will soon be used by vmx_nmi_allowed. No functional
changes.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6abce2d..451ab2a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4486,16 +4486,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-{
-	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
-		return 0;
-
-	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
-		   | GUEST_INTR_STATE_NMI));
-}
-
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
 	if (!cpu_has_virtual_nmis())
@@ -4525,6 +4515,16 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 	}
 }
 
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+		return 0;
+
+	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+		   | GUEST_INTR_STATE_NMI));
+}
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu)) {
-- 
cgit v1.1


From ea8ceb8354e1c84a13cf2a8e915dc74f96759393 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 21:04:26 +0200
Subject: KVM: nVMX: Fix conditions for NMI injection

The logic for checking if interrupts can be injected has to be applied
also on NMIs. The difference is that if NMI interception is on these
events are consumed and blocked by the VM exit.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 451ab2a..11ea3cb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4392,6 +4392,12 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 		PIN_BASED_EXT_INTR_MASK;
 }
 
+static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+		PIN_BASED_NMI_EXITING;
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -4517,6 +4523,26 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+		if (to_vmx(vcpu)->nested.nested_run_pending)
+			return 0;
+		if (nested_exit_on_nmi(vcpu)) {
+			nested_vmx_vmexit(vcpu);
+			vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
+			vmcs12->vm_exit_intr_info = NMI_VECTOR |
+				INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
+			/*
+			 * The NMI-triggered VM exit counts as injection:
+			 * clear this one and block further NMIs.
+			 */
+			vcpu->arch.nmi_pending = 0;
+			vmx_set_nmi_mask(vcpu, true);
+			return 0;
+		}
+	}
+
 	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
 		return 0;
 
-- 
cgit v1.1


From 384bb783275145b70d769acf4c687957d1c61802 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Sat, 20 Apr 2013 10:52:36 +0200
Subject: KVM: nVMX: Validate EFER values for VM_ENTRY/EXIT_LOAD_IA32_EFER

As we may emulate the loading of EFER on VM-entry and VM-exit, implement
the checks that VMX performs on the guest and host values on vmlaunch/
vmresume. Factor out kvm_valid_efer for this purpose which checks for
set reserved bits.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 40 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c | 29 +++++++++++++++++++----------
 2 files changed, 59 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 11ea3cb..d7ef556 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7558,6 +7558,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int cpu;
 	struct loaded_vmcs *vmcs02;
+	bool ia32e;
 
 	if (!nested_vmx_check_permission(vcpu) ||
 	    !nested_vmx_check_vmcs12(vcpu))
@@ -7649,6 +7650,45 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	}
 
 	/*
+	 * If the �load IA32_EFER� VM-entry control is 1, the following checks
+	 * are performed on the field for the IA32_EFER MSR:
+	 * - Bits reserved in the IA32_EFER MSR must be 0.
+	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+	 *   the IA-32e mode guest VM-exit control. It must also be identical
+	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+	 *   CR0.PG) is 1.
+	 */
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
+			nested_vmx_entry_failure(vcpu, vmcs12,
+				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+			return 1;
+		}
+	}
+
+	/*
+	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
+	 * the values of the LMA and LME bits in the field must each be that of
+	 * the host address-space size VM-exit control.
+	 */
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+		ia32e = (vmcs12->vm_exit_controls &
+			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
+			nested_vmx_entry_failure(vcpu, vmcs12,
+				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+			return 1;
+		}
+	}
+
+	/*
 	 * We're finally done with prerequisite checking, and can start with
 	 * the nested entry.
 	 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e730a46..2a434bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -845,23 +845,17 @@ static const u32 emulated_msrs[] = {
 	MSR_IA32_MCG_CTL,
 };
 
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	u64 old_efer = vcpu->arch.efer;
-
 	if (efer & efer_reserved_bits)
-		return 1;
-
-	if (is_paging(vcpu)
-	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
-		return 1;
+		return false;
 
 	if (efer & EFER_FFXSR) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
-			return 1;
+			return false;
 	}
 
 	if (efer & EFER_SVME) {
@@ -869,9 +863,24 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
-			return 1;
+			return false;
 	}
 
+	return true;
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+	u64 old_efer = vcpu->arch.efer;
+
+	if (!kvm_valid_efer(vcpu, efer))
+		return 1;
+
+	if (is_paging(vcpu)
+	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+		return 1;
+
 	efer &= ~EFER_LMA;
 	efer |= vcpu->arch.efer & EFER_LMA;
 
-- 
cgit v1.1


From d1fa0352a151a597e8749e7be84121a1ff0d3502 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 14 Apr 2013 12:44:54 +0200
Subject: KVM: nVMX: VM_ENTRY/EXIT_LOAD_IA32_EFER overrides EFER.LMA settings

If we load the complete EFER MSR on entry or exit, EFER.LMA (and LME)
loading is skipped. Their consistency is already checked now before
starting the transition.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d7ef556..5863adf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7519,7 +7519,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->guest_ia32_efer;
-	if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
 		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
 	else
 		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
@@ -7929,7 +7929,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 {
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
-	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
 		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
 	else
 		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
-- 
cgit v1.1


From 660696d1d16a71e15549ce1bf74953be1592bcd3 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 24 Apr 2013 13:38:36 +0300
Subject: KVM: X86 emulator: fix source operand decoding for 8bit mov[zs]x
 instructions

Source operand for one byte mov[zs]x is decoded incorrectly if it is in
high byte register. Fix that.

Cc: stable@vger.kernel.org
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 46f63b8..8e517bb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4172,6 +4172,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 		break;
 	case OpMem8:
 		ctxt->memop.bytes = 1;
+		if (ctxt->memop.type == OP_REG) {
+			ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
+			fetch_register_operand(&ctxt->memop);
+		}
 		goto mem_common;
 	case OpMem16:
 		ctxt->memop.bytes = 2;
-- 
cgit v1.1


From a725d56a02ec3582bb5b9756f261fdc6962c79ee Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 17 Apr 2013 13:29:30 +0200
Subject: KVM: Introduce CONFIG_HAVE_KVM_IRQ_ROUTING

Quite a bit of code in KVM has been conditionalized on availability of
IOAPIC emulation. However, most of it is generically applicable to
platforms that don't have an IOPIC, but a different type of irq chip.

Make code that only relies on IRQ routing, not an APIC itself, on
CONFIG_HAVE_KVM_IRQ_ROUTING, so that we can reuse it later.

Signed-off-by: Alexander Graf <agraf@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/x86/kvm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 586f000..9d50efd 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
 	select MMU_NOTIFIER
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
 	select KVM_ASYNC_PF
-- 
cgit v1.1


From 1c9f8520bda73c07fed9bcdb307854b45a3a60c4 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 15 Apr 2013 23:04:10 +0200
Subject: KVM: Extract generic irqchip logic into irqchip.c

The current irq_comm.c file contains pieces of code that are generic
across different irqchip implementations, as well as code that is
fully IOAPIC specific.

Split the generic bits out into irqchip.c.

Signed-off-by: Alexander Graf <agraf@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/x86/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 04d3040..a797b8e 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o irqchip.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
-- 
cgit v1.1


From 7df35f549606e8a9004a77ef31dc80dfa893a590 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 16 Apr 2013 12:12:49 +0200
Subject: KVM: Move irqfd resample cap handling to generic code

Now that we have most irqfd code completely platform agnostic, let's move
irqfd's resample capability return to generic code as well.

Signed-off-by: Alexander Graf <agraf@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/x86/kvm/x86.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a434bf..f6c3f03 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2522,7 +2522,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_READONLY_MEM:
-	case KVM_CAP_IRQFD_RESAMPLE:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
-- 
cgit v1.1


From cb0c8cda1301842498d1154bd6d0d1c44d1e7030 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sat, 27 Apr 2013 12:58:00 +0200
Subject: KVM: VMX: remove unprintable characters from comment

Slipped in while copy&pasting from the SDM.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5863adf..0f0cb31 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7650,7 +7650,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	}
 
 	/*
-	 * If the �load IA32_EFER� VM-entry control is 1, the following checks
+	 * If the load IA32_EFER VM-entry control is 1, the following checks
 	 * are performed on the field for the IA32_EFER MSR:
 	 * - Bits reserved in the IA32_EFER MSR must be 0.
 	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
-- 
cgit v1.1


From 6614c7d042eb1096d4eba253b4952bec349f8593 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Fri, 26 Apr 2013 00:22:01 +0200
Subject: kvm, svm: Fix typo in printk message

It is "exit_int_info". It is actually EXITINTINFO in the official docs
but we don't like screaming docs.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d6713e1..15c9ccc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3473,7 +3473,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
 	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
 	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
-		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
+		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
 		       "exit_code 0x%x\n",
 		       __func__, svm->vmcb->control.exit_int_info,
 		       exit_code);
-- 
cgit v1.1


From 730dca42c1d363c939da18c1499c7327c66e2b37 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 28 Apr 2013 10:50:52 +0200
Subject: KVM: x86: Rework request for immediate exit

The VMX implementation of enable_irq_window raised
KVM_REQ_IMMEDIATE_EXIT after we checked it in vcpu_enter_guest. This
caused infinite loops on vmentry. Fix it by letting enable_irq_window
signal the need for an immediate exit via its return value and drop
KVM_REQ_IMMEDIATE_EXIT.

This issue only affects nested VMX scenarios.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/svm.c |  3 ++-
 arch/x86/kvm/vmx.c | 15 ++++++++-------
 arch/x86/kvm/x86.c |  7 +++----
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 15c9ccc..7f896cb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3632,7 +3632,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static int enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3646,6 +3646,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
 	}
+	return 0;
 }
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0f0cb31..74c525e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4398,22 +4398,23 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 		PIN_BASED_NMI_EXITING;
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static int enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 		/*
 		 * We get here if vmx_interrupt_allowed() said we can't
-		 * inject to L1 now because L2 must run. Ask L2 to exit
-		 * right after entry, so we can inject to L1 more promptly.
+		 * inject to L1 now because L2 must run. The caller will have
+		 * to make L2 exit right after entry, so we can inject to L1
+		 * more promptly.
 		 */
-		kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
-		return;
-	}
+		return -EBUSY;
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+	return 0;
 }
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a434bf..c522260 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5692,7 +5692,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
-	bool req_immediate_exit = 0;
+	bool req_immediate_exit = false;
 
 	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5734,8 +5734,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			record_steal_time(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
-		req_immediate_exit =
-			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
 			kvm_handle_pmu_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
@@ -5757,7 +5755,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
 		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-			kvm_x86_ops->enable_irq_window(vcpu);
+			req_immediate_exit =
+				kvm_x86_ops->enable_irq_window(vcpu) != 0;
 
 		if (kvm_lapic_enabled(vcpu)) {
 			/*
-- 
cgit v1.1


From 2a5bab1004729f3302c776e53ee7c895b98bb1ce Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 16 Apr 2013 13:49:18 -0600
Subject: kvm: Allow build-time configuration of KVM device assignment

We hope to at some point deprecate KVM legacy device assignment in
favor of VFIO-based assignment.  Towards that end, allow legacy
device assignment to be deconfigured.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alexander Graf <agraf@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/Kconfig  | 13 +++++++++++--
 arch/x86/kvm/Makefile |  5 +++--
 arch/x86/kvm/x86.c    |  6 ++++--
 3 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 9d50efd..a47a3e5 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,8 +21,6 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM
 	depends on HIGH_RES_TIMERS
-	# for device assignment:
-	depends on PCI
 	# for TASKSTATS/TASK_DELAY_ACCT:
 	depends on NET
 	select PREEMPT_NOTIFIERS
@@ -83,6 +81,17 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 audit  KVM MMU at runtime.
 
+config KVM_DEVICE_ASSIGNMENT
+	bool "KVM legacy PCI device assignment support"
+	depends on KVM && PCI && IOMMU_API
+	default y
+	---help---
+	  Provide support for legacy PCI device assignment through KVM.  The
+	  kernel now also supports a full featured userspace device driver
+	  framework through VFIO, which supersedes much of this support.
+
+	  If unsure, say Y.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a797b8e..d609e1d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,9 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o irqchip.o)
-kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
+				irqchip.o)
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(addprefix ../../../virt/kvm/, \
+				assigned-dev.o iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 145b1c8..8747fef 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2501,7 +2501,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
-	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_IRQFD:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_PIT2:
@@ -2519,9 +2518,12 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
-	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_READONLY_MEM:
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+	case KVM_CAP_ASSIGN_DEV_IRQ:
+	case KVM_CAP_PCI_2_3:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
-- 
cgit v1.1


From 5a2892ce72e010e3cb96b438d7cdddce0c88e0e6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Sun, 28 Apr 2013 09:24:41 +0200
Subject: KVM: nVMX: Skip PF interception check when queuing during nested run

While a nested run is pending, vmx_queue_exception is only called to
requeue exceptions that were previously picked up via
vmx_cancel_injection. Therefore, we must not check for PF interception
by L1, possibly causing a bogus nested vmexit.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 74c525e..e10217e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1917,7 +1917,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
 	if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
-		nested_pf_handled(vcpu))
+	    !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
 		return;
 
 	if (has_error_code) {
-- 
cgit v1.1


From 4cee4b72f1e2600e19779a14d4d9a4f4016ce49f Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 29 Apr 2013 10:54:08 -0600
Subject: kvm: KVM_CAP_IOMMU only available with device assignment

Fix build with CONFIG_PCI unset by linking KVM_CAP_IOMMU to
device assignment config option.  It has no purpose otherwise.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8747fef..96f914e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2544,9 +2544,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PV_MMU:	/* obsolete */
 		r = 0;
 		break;
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_IOMMU:
 		r = iommu_present(&pci_bus_type);
 		break;
+#endif
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
-- 
cgit v1.1


From 03b28f8133165dbe4cd922054d599e26b8119508 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 29 Apr 2013 16:46:42 +0200
Subject: KVM: x86: Account for failing enable_irq_window for NMI window
 request

With VMX, enable_irq_window can now return -EBUSY, in which case an
immediate exit shall be requested before entering the guest. Account for
this also in enable_nmi_window which uses enable_irq_window in absence
of vnmi support, e.g.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c |  5 +++--
 arch/x86/kvm/vmx.c | 16 +++++++---------
 arch/x86/kvm/x86.c |  3 ++-
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7f896cb..3421d5a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3649,13 +3649,13 @@ static int enable_irq_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static int enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
 	    == HF_NMI_MASK)
-		return; /* IRET will cause a vm exit */
+		return 0; /* IRET will cause a vm exit */
 
 	/*
 	 * Something prevents NMI from been injected. Single step over possible
@@ -3664,6 +3664,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 	update_db_bp_intercept(vcpu);
+	return 0;
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e10217e..e53a5f7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4417,22 +4417,20 @@ static int enable_irq_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static int enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
-	if (!cpu_has_virtual_nmis()) {
-		enable_irq_window(vcpu);
-		return;
-	}
+	if (!cpu_has_virtual_nmis())
+		return enable_irq_window(vcpu);
+
+	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
+		return enable_irq_window(vcpu);
 
-	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
-		enable_irq_window(vcpu);
-		return;
-	}
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+	return 0;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96f914e..94f35d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5756,7 +5756,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 		/* enable NMI/IRQ window open exits if needed */
 		if (vcpu->arch.nmi_pending)
-			kvm_x86_ops->enable_nmi_window(vcpu);
+			req_immediate_exit =
+				kvm_x86_ops->enable_nmi_window(vcpu) != 0;
 		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
 			req_immediate_exit =
 				kvm_x86_ops->enable_irq_window(vcpu) != 0;
-- 
cgit v1.1