From f0b85051d0f0891378a83db22c0786f1d756fbff Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:01 +0100 Subject: KVM: SVM: Clean up VINTR setting The current VINTR intercept setters don't look clean to me. To make the code easier to read and enable the possibility to trap on a VINTR set, this uses a helper function to set the VINTR intercept. v2 uses two distinct functions for setting and clearing the bit Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a9e769e..33407d9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -718,6 +718,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_set_vintr(struct vcpu_svm *svm) +{ + svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; +} + +static void svm_clear_vintr(struct vcpu_svm *svm) +{ + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); +} + static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; @@ -1380,7 +1390,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, { KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); + svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* * If the user space waits to inject interrupts, exit as soon as @@ -1593,7 +1603,7 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { /* unable to deliver irq, set pending irq */ - vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); + svm_set_vintr(svm); svm_inject_irq(svm, 0x0); goto out; } @@ -1652,9 +1662,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, */ if (!svm->vcpu.arch.interrupt_window_open && (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) - control->intercept |= 1ULL << INTERCEPT_VINTR; - else - control->intercept &= ~(1ULL << INTERCEPT_VINTR); + svm_set_vintr(svm); + else + svm_clear_vintr(svm); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -- cgit v1.1 From 9962d032bbff0268f22068787831405f8468c8b4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:02 +0100 Subject: KVM: SVM: Move EFER and MSR constants to generic x86 code MSR_EFER_SVME_MASK, MSR_VM_CR and MSR_VM_HSAVE_PA are defined in KVM specific headers. Linux does have nice header files to collect EFER bits and MSR IDs, so IMHO we should put them there. While at it, I also changed the naming scheme to match that of the other defines.
(introduced in v6) Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/include/asm/svm.h | 4 ---- arch/x86/include/asm/virtext.h | 2 +- arch/x86/kvm/svm.c | 6 +++--- 5 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 730843d..2998efe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -22,6 +22,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc5..46e9646 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -18,11 +18,13 @@ #define _EFER_LME 8 /* Long mode enable */ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ +#define _EFER_SVME 12 /* Enable virtualization */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) +#define EFER_SVME (1<<_EFER_SVME) /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_PERFCTR0 0x000000c1 @@ -360,4 +362,9 @@ #define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b #define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c +/* AMD-V MSRs */ + +#define MSR_VM_CR 0xc0010114 +#define MSR_VM_HSAVE_PA 0xc0010117 + #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b8afa7..82ada75 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -174,10 +174,6 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CPUID_FEATURE_SHIFT 2 #define SVM_CPUID_FUNC 0x8000000a -#define MSR_EFER_SVME_MASK (1ULL << 12) -#define MSR_VM_CR 0xc0010114 -#define MSR_VM_HSAVE_PA 0xc0010117ULL - #define SVM_VM_CR_SVM_DISABLE 4 #define SVM_SELECTOR_S_SHIFT 4 diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 5936362..e0f9aa1 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -118,7 +118,7 @@ static inline void cpu_svm_disable(void) wrmsrl(MSR_VM_HSAVE_PA, 0); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + wrmsrl(MSR_EFER, efer & ~EFER_SVME); } /** Makes sure SVM is disabled, if it is supported on the CPU diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 33407d9..e4eb3fd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -198,7 +198,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; - to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; vcpu->arch.shadow_efer = efer; } @@ -292,7 +292,7 @@ static void svm_hardware_enable(void *garbage) svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); + wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(svm_data->save_area) << PAGE_SHIFT); @@ -559,7 +559,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = MSR_EFER_SVME_MASK; + save->efer = EFER_SVME; save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; -- cgit v1.1 From c0725420cfdcf6dd9705b164a8c6cba86684080d Mon Sep 17 00:00:00 2001 
From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:03 +0100 Subject: KVM: SVM: Add helper functions for nested SVM These are helpers for the nested SVM implementation. - nsvm_printk implements a debug printk variant - nested_svm_do calls a handler that can access gpa-based memory v3 makes use of the new permission checker v6 changes: - streamline nsvm_debug() - remove printk(KERN_ERR) - SVME check before CPL check - give GP error code - use new EFER constant Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e4eb3fd..87debdc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -50,6 +50,15 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +/* Turn on to get debugging output*/ +/* #define NESTED_DEBUG */ + +#ifdef NESTED_DEBUG +#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args) +#else +#define nsvm_printk(fmt, args...) do {} while(0) +#endif + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -1149,6 +1158,82 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int nested_svm_check_permissions(struct vcpu_svm *svm) +{ + if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + || !is_paging(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + + if (svm->vmcb->save.cpl) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } + + return 0; +} + +static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) +{ + struct page *page; + + down_read(&current->mm->mmap_sem); + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + up_read(&current->mm->mmap_sem); + + if (is_error_page(page)) { + printk(KERN_INFO "%s: could not find page at 0x%llx\n", + __func__, gpa); + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + return NULL; + } + return page; +} + +static int nested_svm_do(struct vcpu_svm *svm, + u64 arg1_gpa, u64 arg2_gpa, void *opaque, + int (*handler)(struct vcpu_svm *svm, + void *arg1, + void *arg2, + void *opaque)) +{ + struct page *arg1_page; + struct page *arg2_page = NULL; + void *arg1; + void *arg2 = NULL; + int retval; + + arg1_page = nested_svm_get_page(svm, arg1_gpa); + if(arg1_page == NULL) + return 1; + + if (arg2_gpa) { + arg2_page = nested_svm_get_page(svm, arg2_gpa); + if(arg2_page == NULL) { + kvm_release_page_clean(arg1_page); + return 1; + } + } + + arg1 = kmap_atomic(arg1_page, KM_USER0); + if (arg2_gpa) + arg2 = kmap_atomic(arg2_page, KM_USER1); + + retval = handler(svm, arg1, arg2, opaque); + + kunmap_atomic(arg1, KM_USER0); + if (arg2_gpa) + kunmap_atomic(arg2, KM_USER1); + + kvm_release_page_dirty(arg1_page); + if (arg2_gpa) + kvm_release_page_dirty(arg2_page); + + return retval; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { -- cgit v1.1 From 1371d90460189d02bf1bcca19dbfe6bd10dc6031 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:04 +0100 Subject: KVM: SVM: Implement GIF, clgi and stgi This patch implements the GIF flag and the clgi and stgi instructions that set this flag. Interrupts can only be received by the CPU if this flag is set (the default). To keep the information about that somewhere, this patch adds a new hidden flags vector
that is used to store information that does not go into the vmcb, but is SVM specific. I tried to write some code to make -no-kvm-irqchip work too, but the first level guest won't even boot with that atm, so I ditched it. v2 moves the hflags to x86 generic code v3 makes use of the new permission helper v6 only enables interrupt_window if GIF=1 Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 47 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2998efe..29e4157 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -259,6 +259,7 @@ struct kvm_vcpu_arch { unsigned long cr3; unsigned long cr4; unsigned long cr8; + u32 hflags; u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; @@ -738,6 +739,8 @@ enum { TASK_SWITCH_GATE = 3, }; +#define HF_GIF_MASK (1 << 0) + /* * Hardware virtualization extension instructions may fault if a * reboot turns off virtualization while processes are running. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 87debdc..79cc06b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -251,7 +251,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, svm->next_rip); svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - vcpu->arch.interrupt_window_open = 1; + vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK); } static int has_svm(void) @@ -600,6 +600,8 @@ static void init_vmcb(struct vcpu_svm *svm) save->cr4 = 0; } force_new_asid(&svm->vcpu); + + svm->vcpu.arch.hflags = HF_GIF_MASK; } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -1234,6 +1236,36 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } +static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + svm->vcpu.arch.hflags |= HF_GIF_MASK; + + return 1; +} + +static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + + /* After a CLGI no interrupts should come */ + svm_clear_vintr(svm); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1535,8 +1567,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = invalid_op_interception, [SVM_EXIT_VMSAVE] = invalid_op_interception, - [SVM_EXIT_STGI] = invalid_op_interception, - [SVM_EXIT_CLGI] = invalid_op_interception, + [SVM_EXIT_STGI] = stgi_interception, + [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, @@ -1684,6 +1716,9 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) if (!kvm_cpu_has_interrupt(vcpu)) goto out; + if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) + goto out; + if (!(vmcb->save.rflags & X86_EFLAGS_IF) || (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { @@ -1710,7 +1745,8 
@@ static void kvm_reput_irq(struct vcpu_svm *svm) } svm->vcpu.arch.interrupt_window_open = - !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); + !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vcpu.arch.hflags & HF_GIF_MASK); } static void svm_do_inject_vector(struct vcpu_svm *svm) @@ -1734,7 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, svm->vcpu.arch.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vmcb->save.rflags & X86_EFLAGS_IF)); + (svm->vmcb->save.rflags & X86_EFLAGS_IF) && + (svm->vcpu.arch.hflags & HF_GIF_MASK)); if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) /* -- cgit v1.1 From b286d5d8b0836e76832dafcc5a18b0e8e5a3bc5e Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:05 +0100 Subject: KVM: SVM: Implement hsave Implement the hsave MSR, that gives the VCPU a GPA to save the old guest state in. v2 allows userspace to save/restore hsave v4 dummys out the hsave MSR, so we use a host page v6 remembers the guest's hsave and exports the MSR Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_svm.h | 2 ++ arch/x86/kvm/svm.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 8e5ee99..a0877ca 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -41,6 +41,8 @@ struct vcpu_svm { unsigned long host_dr7; u32 *msrpm; + struct vmcb *hsave; + u64 hsave_msr; }; #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 79cc06b..59aaff1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -626,6 +626,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; + struct page *hsave_page; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -651,6 +652,11 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); + hsave_page = alloc_page(GFP_KERNEL); + if (!hsave_page) + goto uninit; + svm->hsave = page_address(hsave_page); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; @@ -680,6 +686,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->hsave)); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -1377,6 +1384,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_LASTINTTOIP: *data = svm->vmcb->save.last_excp_to; break; + case MSR_VM_HSAVE_PA: + *data = svm->hsave_msr; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } @@ -1471,6 +1481,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_VM_HSAVE_PA: + svm->hsave_msr = data; + break; default: return kvm_set_msr_common(vcpu, ecx, data); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 758b7a1..99165a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -456,7 +456,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - 
MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; -- cgit v1.1 From 5542675baa7e62ca4d18278c8758b6a4ec410639 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:06 +0100 Subject: KVM: SVM: Add VMLOAD and VMSAVE handlers This implements the VMLOAD and VMSAVE instructions, that usually surround the VMRUN instructions. Both instructions load / restore the same elements, so we only need to implement them once. v2 fixes CPL checking and replaces memcpy by assignments v3 makes use of the new permission checking Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59aaff1..a83c94e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1243,6 +1243,62 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } +static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + to_vmcb->save.fs = from_vmcb->save.fs; + to_vmcb->save.gs = from_vmcb->save.gs; + to_vmcb->save.tr = from_vmcb->save.tr; + to_vmcb->save.ldtr = from_vmcb->save.ldtr; + to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; + to_vmcb->save.star = from_vmcb->save.star; + to_vmcb->save.lstar = from_vmcb->save.lstar; + to_vmcb->save.cstar = from_vmcb->save.cstar; + to_vmcb->save.sfmask = from_vmcb->save.sfmask; + to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; + to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; + to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; + + return 1; +} + +static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque) +{ + return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); +} + +static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque) +{ + return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); +} + +static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + + return 1; +} + +static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + + return 1; +} + static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (nested_svm_check_permissions(svm)) @@ -1578,8 +1634,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = invalid_op_interception, [SVM_EXIT_VMMCALL] = vmmcall_interception, - [SVM_EXIT_VMLOAD] = invalid_op_interception, - [SVM_EXIT_VMSAVE] = invalid_op_interception, + [SVM_EXIT_VMLOAD] = vmload_interception, + [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = invalid_op_interception, -- cgit v1.1 From 3d6368ef580a4dff012960834bba4e28d3c1430c Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:07 +0100 Subject: KVM: SVM: 
Add VMRUN handler This patch implements VMRUN. VMRUN enters a virtual CPU and runs that in the same context as the normal guest CPU would run. So basically it is implemented the same way, a normal CPU would do it. We also prepare all intercepts that get OR'ed with the original intercepts, as we do not allow a level 2 guest to be intercepted less than the first level guest. v2 implements the following improvements: - fixes the CPL check - does not allocate iopm when not used - remembers the host's IF in the HIF bit in the hflags v3: - make use of the new permission checking - add support for V_INTR_MASKING_MASK v4: - use host page backed hsave v5: - remove IOPM merging code v6: - save cr4 so PAE l1 guests work v7: - return 0 on vmrun so we check the MSRs too - fix MSR check to use the correct variable Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/kvm_svm.h | 8 ++ arch/x86/kvm/svm.c | 157 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 29e4157..5377930 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -740,6 +740,8 @@ enum { }; #define HF_GIF_MASK (1 << 0) +#define HF_HIF_MASK (1 << 1) +#define HF_VINTR_MASK (1 << 2) /* * Hardware virtualization extension instructions may fault if a diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index a0877ca..9167341 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -43,6 +43,14 @@ struct vcpu_svm { u32 *msrpm; struct vmcb *hsave; u64 hsave_msr; + + u64 nested_vmcb; + + /* These are the merged vectors */ + u32 *nested_msrpm; + + /* gpa pointers to the real vectors */ + u64 nested_vmcb_msrpm; }; #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a83c94e..fad187c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -77,6 +77,11 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline bool is_nested(struct vcpu_svm *svm) +{ + return svm->nested_vmcb; +} + static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -601,6 +606,7 @@ static void init_vmcb(struct vcpu_svm *svm) } force_new_asid(&svm->vcpu); + svm->nested_vmcb = 0; svm->vcpu.arch.hflags = HF_GIF_MASK; } @@ -627,6 +633,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *page; struct page *msrpm_pages; struct page *hsave_page; + struct page *nested_msrpm_pages; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -649,6 +656,11 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto uninit; + + nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!nested_msrpm_pages) + goto uninit; + svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); @@ -657,6 +669,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit; svm->hsave = page_address(hsave_page); + svm->nested_msrpm = page_address(nested_msrpm_pages); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; @@ -687,6 +701,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 
__free_page(virt_to_page(svm->hsave)); + __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -1243,6 +1258,123 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } + +static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + int i; + u32 *nested_msrpm = (u32*)arg1; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) + svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); + + return 0; +} + +static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *hsave = svm->hsave; + + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested_vmcb = svm->vmcb->save.rax; + + /* Clear internal status */ + svm->vcpu.arch.exception.pending = false; + + /* Save the old vmcb, so we don't need to pick what we save, but + can restore everything when a VMEXIT occurs */ + memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); + /* We need to remember the original CR3 in the SPT case */ + if (!npt_enabled) + hsave->save.cr3 = svm->vcpu.arch.cr3; + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rip = svm->next_rip; + + if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + /* Load the nested guest state */ + svm->vmcb->save.es = nested_vmcb->save.es; + svm->vmcb->save.cs = nested_vmcb->save.cs; + svm->vmcb->save.ss = nested_vmcb->save.ss; + svm->vmcb->save.ds = nested_vmcb->save.ds; + svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; + svm->vmcb->save.idtr = nested_vmcb->save.idtr; + svm->vmcb->save.rflags = nested_vmcb->save.rflags; + svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); + svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); + svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = nested_vmcb->save.cr3; + svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; + } else { + kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + kvm_mmu_reset_context(&svm->vcpu); + } + svm->vmcb->save.cr2 = nested_vmcb->save.cr2; + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + /* In case we don't even reach vcpu_run, the fields are not updated */ + svm->vmcb->save.rax = nested_vmcb->save.rax; + svm->vmcb->save.rsp = nested_vmcb->save.rsp; + svm->vmcb->save.rip = nested_vmcb->save.rip; + svm->vmcb->save.dr7 = nested_vmcb->save.dr7; + svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vmcb->save.cpl = nested_vmcb->save.cpl; + + /* We don't want a nested guest to be more powerful than the guest, + so all intercepts are ORed */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + + force_new_asid(&svm->vcpu); + svm->vmcb->control.exit_int_info 
= nested_vmcb->control.exit_int_info; + svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; + if (nested_vmcb->control.int_ctl & V_IRQ_MASK) { + nsvm_printk("nSVM Injecting Interrupt: 0x%x\n", + nested_vmcb->control.int_ctl); + } + if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) + svm->vcpu.arch.hflags |= HF_VINTR_MASK; + else + svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + + nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n", + nested_vmcb->control.exit_int_info, + nested_vmcb->control.int_state); + + svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; + svm->vmcb->control.int_state = nested_vmcb->control.int_state; + svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; + if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID) + nsvm_printk("Injecting Event: 0x%x\n", + nested_vmcb->control.event_inj); + svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; + svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + + svm->vcpu.arch.hflags |= HF_GIF_MASK; + + return 0; +} + static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; @@ -1299,6 +1431,26 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + nsvm_printk("VMrun\n"); + if (nested_svm_check_permissions(svm)) + return 1; + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + + if (nested_svm_do(svm, svm->vmcb->save.rax, 0, + NULL, nested_svm_vmrun)) + return 1; + + if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, + NULL, nested_svm_vmrun_msrpm)) + return 1; + + return 1; +} + static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (nested_svm_check_permissions(svm)) @@ -1632,7 +1784,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, - [SVM_EXIT_VMRUN] = invalid_op_interception, + [SVM_EXIT_VMRUN] = vmrun_interception, [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, @@ -1939,7 +2091,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->arch.cr2; + if (!is_nested(svm)) + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; -- cgit v1.1 From cf74a78b229d07f77416d2fe1f029f183a8a31df Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:08 +0100 Subject: KVM: SVM: Add VMEXIT handler and intercepts This adds the #VMEXIT intercept, so we return to the level 1 guest when something happens in the level 2 guest that should return to the level 1 guest. 
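At its core, the exit path added here makes one decision on every #VMEXIT while an L2 guest is running: does the L1 hypervisor intercept this exit code or not? A condensed sketch of that flow, using the helpers introduced by this patch (is_nested(), nested_svm_exit_handled(), nested_svm_vmexit()) rather than the literal handle_exit() code:

/* Condensed sketch of the nested exit decision, not the literal kernel code. */
static int reflect_exit_to_l1(struct vcpu_svm *svm)
{
	if (!is_nested(svm))
		return 0;	/* no L2 guest active: KVM handles the exit itself */

	if (nested_svm_exit_handled(svm, true)) {
		/* L1 intercepts this exit code: copy the exit information into
		 * the nested vmcb, restore L1 state from the host save area
		 * and resume the L1 guest, which then sees the #VMEXIT. */
		nested_svm_vmexit(svm);
		return 1;
	}

	return 0;	/* not intercepted by L1: emulate on behalf of L2 */
}
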
v2 implements HIF handling and cleans up exception interception v3 adds support for V_INTR_MASKING_MASK v4 uses the host page hsave v5 removes IOPM merging code v6 moves mmu code out of the atomic section Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 293 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fad187c..4cb2920 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -72,6 +72,13 @@ module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_vmexit(struct vcpu_svm *svm); +static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, + void *arg2, void *opaque); +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code); + static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); @@ -221,6 +228,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, { struct vcpu_svm *svm = to_svm(vcpu); + /* If we are within a nested VM we'd better #VMEXIT and let the + guest handle the exception */ + if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) @@ -1198,6 +1210,46 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 0; } +static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code) +{ + if (is_nested(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + if (nested_svm_exit_handled(svm, false)) { + nsvm_printk("VMexit -> EXCP 0x%x\n", nr); + + nested_svm_vmexit(svm); + return 1; + } + } + + return 0; +} + +static inline int nested_svm_intr(struct vcpu_svm *svm) +{ + if (is_nested(svm)) { + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return 0; + + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return 0; + + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm, false)) { + nsvm_printk("VMexit -> INTR\n"); + nested_svm_vmexit(svm); + return 1; + } + } + + return 0; +} + static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) { struct page *page; @@ -1258,6 +1310,228 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } +static int nested_svm_exit_handled_real(struct vcpu_svm *svm, + void *arg1, + void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + bool kvm_overrides = *(bool *)opaque; + u32 exit_code = svm->vmcb->control.exit_code; + + if (kvm_overrides) { + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return 0; + /* For now we are always handling NPFs when using them */ + case SVM_EXIT_NPF: + if (npt_enabled) + return 0; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return 0; + break; + default: + break; + } + } + + switch (exit_code) { + case SVM_EXIT_READ_CR0 ... 
SVM_EXIT_READ_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); + if (nested_vmcb->control.intercept_cr_read & cr_bits) + return 1; + break; + } + case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { + u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); + if (nested_vmcb->control.intercept_cr_write & cr_bits) + return 1; + break; + } + case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); + if (nested_vmcb->control.intercept_dr_read & dr_bits) + return 1; + break; + } + case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { + u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); + if (nested_vmcb->control.intercept_dr_write & dr_bits) + return 1; + break; + } + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + if (nested_vmcb->control.intercept_exceptions & excp_bits) + return 1; + break; + } + default: { + u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); + nsvm_printk("exit code: 0x%x\n", exit_code); + if (nested_vmcb->control.intercept & exit_bits) + return 1; + } + } + + return 0; +} + +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, + void *arg1, void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + u8 *msrpm = (u8 *)arg2; + u32 t0, t1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 param = svm->vmcb->control.exit_info_1 & 1; + + if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return 0; + + switch(msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + return 1; + break; + } + if (msrpm[t1] & ((1 << param) << t0)) + return 1; + + return 0; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +{ + bool k = kvm_override; + + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + return nested_svm_do(svm, svm->nested_vmcb, + svm->nested_vmcb_msrpm, NULL, + nested_svm_exit_handled_msr); + default: break; + } + + return nested_svm_do(svm, svm->nested_vmcb, 0, &k, + nested_svm_exit_handled_real); +} + +static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, + void *arg2, void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *hsave = svm->hsave; + u64 nested_save[] = { nested_vmcb->save.cr0, + nested_vmcb->save.cr3, + nested_vmcb->save.cr4, + nested_vmcb->save.efer, + nested_vmcb->control.intercept_cr_read, + nested_vmcb->control.intercept_cr_write, + nested_vmcb->control.intercept_dr_read, + nested_vmcb->control.intercept_dr_write, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept, + nested_vmcb->control.msrpm_base_pa, + nested_vmcb->control.iopm_base_pa, + nested_vmcb->control.tsc_offset }; + + /* Give the current vmcb to the guest */ + memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); + nested_vmcb->save.cr0 = nested_save[0]; + if (!npt_enabled) + nested_vmcb->save.cr3 = nested_save[1]; + nested_vmcb->save.cr4 = nested_save[2]; + nested_vmcb->save.efer = nested_save[3]; + nested_vmcb->control.intercept_cr_read = nested_save[4]; + nested_vmcb->control.intercept_cr_write = nested_save[5]; + nested_vmcb->control.intercept_dr_read = nested_save[6]; + nested_vmcb->control.intercept_dr_write = nested_save[7]; + nested_vmcb->control.intercept_exceptions = 
nested_save[8]; + nested_vmcb->control.intercept = nested_save[9]; + nested_vmcb->control.msrpm_base_pa = nested_save[10]; + nested_vmcb->control.iopm_base_pa = nested_save[11]; + nested_vmcb->control.tsc_offset = nested_save[12]; + + /* We always set V_INTR_MASKING and remember the old value in hflags */ + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && + (nested_vmcb->control.int_vector)) { + nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", + nested_vmcb->control.int_vector); + } + + /* Restore the original control entries */ + svm->vmcb->control = hsave->control; + + /* Kill any pending exceptions */ + if (svm->vcpu.arch.exception.pending == true) + nsvm_printk("WARNING: Pending Exception\n"); + svm->vcpu.arch.exception.pending = false; + + /* Restore selected save entries */ + svm->vmcb->save.es = hsave->save.es; + svm->vmcb->save.cs = hsave->save.cs; + svm->vmcb->save.ss = hsave->save.ss; + svm->vmcb->save.ds = hsave->save.ds; + svm->vmcb->save.gdtr = hsave->save.gdtr; + svm->vmcb->save.idtr = hsave->save.idtr; + svm->vmcb->save.rflags = hsave->save.rflags; + svm_set_efer(&svm->vcpu, hsave->save.efer); + svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); + svm_set_cr4(&svm->vcpu, hsave->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = hsave->save.cr3; + svm->vcpu.arch.cr3 = hsave->save.cr3; + } else { + kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + } + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); + kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); + kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); + svm->vmcb->save.dr7 = 0; + svm->vmcb->save.cpl = 0; + svm->vmcb->control.exit_int_info = 0; + + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + /* Exit nested SVM mode */ + svm->nested_vmcb = 0; + + return 0; +} + +static int nested_svm_vmexit(struct vcpu_svm *svm) +{ + nsvm_printk("VMexit\n"); + if (nested_svm_do(svm, svm->nested_vmcb, 0, + NULL, nested_svm_vmexit_real)) + return 1; + + kvm_mmu_reset_context(&svm->vcpu); + kvm_mmu_load(&svm->vcpu); + + return 0; +} static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) @@ -1805,6 +2079,17 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + if (is_nested(svm)) { + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", + exit_code, svm->vmcb->control.exit_info_1, + svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); + if (nested_svm_exit_handled(svm, true)) { + nested_svm_vmexit(svm); + nsvm_printk("-> #VMEXIT\n"); + return 1; + } + } + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -1892,6 +2177,8 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_svm *svm = to_svm(vcpu); + nested_svm_intr(svm); + svm_inject_irq(svm, irq); } @@ -1937,6 +2224,9 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) if (!kvm_cpu_has_interrupt(vcpu)) goto out; + if (nested_svm_intr(svm)) + goto out; + if (!(svm->vcpu.arch.hflags & HF_GIF_MASK)) goto out; @@ -1989,6 +2279,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; + if (nested_svm_intr(svm)) + return; + svm->vcpu.arch.interrupt_window_open = (!(control->int_state & 
SVM_INTERRUPT_SHADOW_MASK) && (svm->vmcb->save.rflags & X86_EFLAGS_IF) && -- cgit v1.1 From eb6f302edff9da963e687bf6c15dcf88843b1c3b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 25 Nov 2008 20:17:09 +0100 Subject: KVM: SVM: Allow read access to MSR_VM_CR KVM tries to read the VM_CR MSR to find out if SVM was disabled by the BIOS. So implement read support for this MSR to get nested SVM running. Signed-off-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4cb2920..df5b411 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1869,6 +1869,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_VM_HSAVE_PA: *data = svm->hsave_msr; break; + case MSR_VM_CR: + *data = 0; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } -- cgit v1.1 From 236de05553a7ef8f6940de8686ae9bf1272cd2cf Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:10 +0100 Subject: KVM: SVM: Allow setting the SVME bit Normally setting the SVME bit in EFER is not allowed, as we did not support SVM. Now that we do, we should also allow enabling SVM mode. v2 comes as last patch, so we don't enable half-ready code v4 introduces a module option to enable SVM v6 warns that nesting is enabled Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index df5b411..0fbbde5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -69,6 +69,9 @@ static int npt = 1; module_param(npt, int, S_IRUGO); +static int nested = 0; +module_param(nested, int, S_IRUGO); + static void kvm_reput_irq(struct vcpu_svm *svm); static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -443,6 +446,11 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME); + } + for_each_online_cpu(cpu) { r = svm_cpu_init(cpu); if (r) -- cgit v1.1 From d80174745ba3938bc6abb8f95ed670ac0631a182 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 25 Nov 2008 20:17:11 +0100 Subject: KVM: SVM: Only allow setting of EFER_SVME when CPUID SVM is set Userspace has to tell the kernel module somehow that nested SVM should be used. The easiest way I could think of that doesn't break anything is to implement if (cpuid & svm) allow write to efer else deny write to efer Old userspaces mask the SVM capability bit, so they don't break. In order to find out that the SVM capability is set, I had to split the kvm_emulate_cpuid into a finding and an emulating part.
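In C, the check described above amounts to roughly the following guard in set_efer(); this is a condensed restatement of the hunk in the diff below, with the debug printk dropped:

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		/* Only let the guest set EFER.SVME if userspace exposed the
		 * SVM feature bit in CPUID leaf 0x8000_0001 ECX. */
		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
			kvm_inject_gp(vcpu, 0);	/* #GP: SVM not advertised */
			return;
		}
	}
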
(introduced in v6) Acked-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 58 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99165a9..b5e9932 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -69,6 +69,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -173,6 +175,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending) { if (vcpu->arch.exception.nr == PF_VECTOR) { printk(KERN_DEBUG "kvm: inject_page_fault:" @@ -442,6 +445,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -481,6 +489,17 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } + if (efer & EFER_SVME) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { + printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; @@ -1181,11 +1200,6 @@ out: return r; } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { @@ -1228,7 +1242,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word3_x86_features = bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | + bit(X86_FEATURE_SVM); /* all func 2 cpuid_count() should be called on the same cpu */ get_cpu(); @@ -2832,20 +2847,15 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, return 1; } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) { int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; + struct kvm_cpuid_entry2 *best = NULL; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + e = &vcpu->arch.cpuid_entries[i]; if (is_matching_cpuid_entry(e, function, index)) { if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) @@ -2860,6 +2870,22 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) if (!best || e->function > best->function) best = e; } + + return best; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = 
kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); -- cgit v1.1 From 8ab2d2e231062814bd89bba2d6d92563190aa2bb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: VMX: Support for injecting software exceptions VMX differentiates between processor and software generated exceptions when injecting them into the guest. Extend vmx_queue_exception accordingly (and refactor related constants) so that we can use this service reliably for the new guest debugging framework. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 3 ++- arch/x86/kvm/vmx.c | 35 ++++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d0238e6..32159f0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -270,8 +270,9 @@ enum vmcs_field { #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ #define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ +#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7611af5..1d974c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -189,21 +189,21 @@ static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_no_device(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_invalid_opcode(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); } static inline int is_external_interrupt(u32 intr_info) @@ -747,29 +747,33 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (has_error_code) + if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (nr == BP_VECTOR) + if (nr == BP_VECTOR || nr == OF_VECTOR) vmx->rmode.irq.rip++; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_SOFT_INTR - | (has_error_code ? 
INTR_INFO_DELIVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); + intr_info |= INTR_TYPE_SOFT_INTR; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); + if (nr == BP_VECTOR || nr == OF_VECTOR) { + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + } else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) @@ -2650,7 +2654,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_EXCEPTION | 1)) { + (INTR_TYPE_HARD_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; } @@ -3238,7 +3242,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) vmx->vcpu.arch.nmi_injected = false; } kvm_clear_exception_queue(&vmx->vcpu); - if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || + type == INTR_TYPE_SOFT_EXCEPTION)) { if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { error = vmcs_read32(IDT_VECTORING_ERROR_CODE); kvm_queue_exception_e(&vmx->vcpu, vector, error); -- cgit v1.1 From d0bfb940ecabf0b44fb1fd80d8d60594e569e5ec Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: New guest debug interface This rips out the support for KVM_DEBUG_GUEST and introduces a new IOCTL instead: KVM_SET_GUEST_DEBUG. The IOCTL payload consists of a generic part, controlling the "main switch" and the single-step feature. The arch specific part adds an x86 interface for intercepting both types of debug exceptions separately and re-injecting them when the host was not interested. Moreover, the foundation for guest debugging via debug registers is laid. To signal breakpoint events properly back to userland, an arch-specific data block is now returned along with KVM_EXIT_DEBUG. For x86, the arch block contains the PC, the debug exception, and relevant debug registers to tell debug events properly apart. The availability of this new interface is signaled by KVM_CAP_SET_GUEST_DEBUG. Empty stubs for not yet supported archs are provided. Note that both SVM and VTX are supported, but only the latter has been tested so far. Based on the experience with all those VTX corner cases, I would be fairly surprised if SVM will work out of the box.
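From userspace, the new interface is driven per vcpu; a minimal sketch of enabling software-breakpoint interception, assuming an already created vcpu fd (the fd setup is not part of this patch, error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Ask KVM to intercept #BP (INT3) on this vcpu and report it to userspace. */
static int enable_sw_breakpoints(int vcpu_fd)
{
	struct kvm_guest_debug dbg;

	memset(&dbg, 0, sizeof(dbg));
	dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}

A subsequent KVM_RUN that hits an INT3 then returns to userspace with exit_reason == KVM_EXIT_DEBUG and the new kvm_run->debug.arch block (pc, exception number) filled in.
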
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 18 ++++++++ arch/x86/include/asm/kvm_host.h | 9 +--- arch/x86/kvm/svm.c | 50 +++++++++++++++++++++- arch/x86/kvm/vmx.c | 93 ++++++++++++++++------------------------- arch/x86/kvm/x86.c | 14 ++++--- 5 files changed, 111 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 886c940..32eb96c 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -212,6 +212,24 @@ struct kvm_pit_channel_state { __s64 count_load_time; }; +struct kvm_debug_exit_arch { + __u32 exception; + __u32 pad; + __u64 pc; + __u64 dr6; + __u64 dr7; +}; + +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +#define KVM_GUESTDBG_USE_HW_BP 0x00020000 +#define KVM_GUESTDBG_INJECT_DB 0x00040000 +#define KVM_GUESTDBG_INJECT_BP 0x00080000 + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { + __u64 debugreg[8]; +}; + struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5377930..c430cd5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -135,12 +135,6 @@ enum { #define KVM_NR_MEM_OBJS 40 -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -448,8 +442,7 @@ struct kvm_x86_ops { void (*vcpu_put)(struct kvm_vcpu *vcpu); int (*set_guest_debug)(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); - void (*guest_debug_pre)(struct kvm_vcpu *vcpu); + struct kvm_guest_debug *dbg); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0fbbde5..88d9062 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -968,9 +968,32 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - return -EOPNOTSUPP; + int old_debug = vcpu->guest_debug; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->guest_debug = dbg->control; + + svm->vmcb->control.intercept_exceptions &= + ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + svm->vmcb->control.intercept_exceptions |= + 1 << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + svm->vmcb->control.intercept_exceptions |= + 1 << BP_VECTOR; + } else + vcpu->guest_debug = 0; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) + svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + return 0; } static int svm_get_irq(struct kvm_vcpu *vcpu) @@ -1094,6 +1117,27 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } +static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (!(svm->vcpu.guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + kvm_queue_exception(&svm->vcpu, DB_VECTOR); + return 1; + } + 
kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = DB_VECTOR; + return 0; +} + +static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; + kvm_run->debug.arch.exception = BP_VECTOR; + return 0; +} + static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { int er; @@ -2050,6 +2094,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, + [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d974c1..f55690d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -480,8 +480,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; - if (vcpu->guest_debug.enabled) - eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + eb |= 1u << DB_VECTOR; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + eb |= 1u << BP_VECTOR; + } if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) @@ -1003,40 +1008,23 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - unsigned long dr7 = 0x400; - int old_singlestep; - - old_singlestep = vcpu->guest_debug.singlestep; - - vcpu->guest_debug.enabled = dbg->enabled; - if (vcpu->guest_debug.enabled) { - int i; - - dr7 |= 0x200; /* exact */ - for (i = 0; i < 4; ++i) { - if (!dbg->breakpoints[i].enabled) - continue; - vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; - dr7 |= 2 << (i*2); /* global enable */ - dr7 |= 0 << (i*4+16); /* execution breakpoint */ - } - - vcpu->guest_debug.singlestep = dbg->singlestep; - } else - vcpu->guest_debug.singlestep = 0; + int old_debug = vcpu->guest_debug; + unsigned long flags; - if (old_singlestep && !vcpu->guest_debug.singlestep) { - unsigned long flags; + vcpu->guest_debug = dbg->control; + if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) + vcpu->guest_debug = 0; - flags = vmcs_readl(GUEST_RFLAGS); + flags = vmcs_readl(GUEST_RFLAGS); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + else if (old_debug & KVM_GUESTDBG_SINGLESTEP) flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - vmcs_writel(GUEST_RFLAGS, flags); - } + vmcs_writel(GUEST_RFLAGS, flags); update_exception_bitmap(vcpu); - vmcs_writel(GUEST_DR7, dr7); return 0; } @@ -2540,24 +2528,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) -{ - struct kvm_guest_debug *dbg = &vcpu->guest_debug; - - set_debugreg(dbg->bp[0], 0); - set_debugreg(dbg->bp[1], 1); - set_debugreg(dbg->bp[2], 2); - set_debugreg(dbg->bp[3], 3); - - if (dbg->singlestep) { - unsigned long flags; - - flags = vmcs_readl(GUEST_RFLAGS); - flags |= 
X86_EFLAGS_TF | X86_EFLAGS_RF; - vmcs_writel(GUEST_RFLAGS, flags); - } -} - static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { @@ -2574,9 +2544,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * the required debugging infrastructure rework. */ switch (vec) { - case DE_VECTOR: case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return 0; + kvm_queue_exception(vcpu, vec); + return 1; case BP_VECTOR: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return 0; + /* fall through */ + case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: @@ -2593,7 +2571,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info, error_code; + u32 intr_info, ex_no, error_code; unsigned long cr2, rip; u32 vect_info; enum emulation_result er; @@ -2653,14 +2631,16 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | 1)) { + ex_no = intr_info & INTR_INFO_VECTOR_MASK; + if (ex_no == DB_VECTOR || ex_no == BP_VECTOR) { kvm_run->exit_reason = KVM_EXIT_DEBUG; - return 0; + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.exception = ex_no; + } else { + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = ex_no; + kvm_run->ex.error_code = error_code; } - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; - kvm_run->ex.error_code = error_code; return 0; } @@ -3600,7 +3580,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_put = vmx_vcpu_put, .set_guest_debug = set_guest_debug, - .guest_debug_pre = kvm_guest_debug_pre, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e9932..e990d16 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3005,9 +3005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - vcpu->guest_mode = 1; /* * Make sure that guest_mode assignment won't happen after @@ -3218,7 +3215,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) /* * Don't leak debug flags in case they were set for guest debugging */ - if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); vcpu_put(vcpu); @@ -3837,8 +3834,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { int r; @@ -3846,6 +3843,11 @@ int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + if (dbg->control & KVM_GUESTDBG_INJECT_DB) + kvm_queue_exception(vcpu, DB_VECTOR); + else if (dbg->control & KVM_GUESTDBG_INJECT_BP) + kvm_queue_exception(vcpu, BP_VECTOR); + vcpu_put(vcpu); return r; -- cgit v1.1 From 55934c0bd3bb232a9cf902820dd63ad18ed65e49 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: VMX: Allow single-stepping when uninterruptible 
When single-stepping over STI and MOV SS, we must clear the corresponding interruptibility bits in the guest state. Otherwise vmentry fails as it then expects bit 14 (BS) in pending debug exceptions being set, but that's not correct for the guest debugging case. Note that clearing those bits is safe as we check for interruptibility based on the original state and do not inject interrupts or NMIs if guest interruptibility was blocked. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f55690d..c776868 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2478,6 +2478,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, { vmx_update_window_states(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vcpu->arch.interrupt.pending) { enable_nmi_window(vcpu); @@ -3244,6 +3249,11 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) vmx_update_window_states(vcpu); + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vcpu->arch.interrupt.pending) { enable_nmi_window(vcpu); -- cgit v1.1 From 42dbaa5a057736bf8b5c22aa42dbe975bf1080e5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: x86: Virtualize debug registers So far KVM only had basic x86 debug register support, once introduced to realize guest debugging that way. The guest itself was not able to use those registers. This patch now adds (almost) full support for guest self-debugging via hardware registers. It refactors the code, moving generic parts out of SVM (VMX was already cleaned up by the KVM_SET_GUEST_DEBUG patches), and it ensures that the registers are properly switched between host and guest. This patch also prepares debug register usage by the host. The latter will (once wired-up by the following patch) allow for hardware breakpoints/watchpoints in guest code. If this is enabled, the guest will only see faked debug registers without functionality, but with content reflecting the guest's modifications. Tested on Intel only, but SVM /should/ work as well, but who knows... Known limitations: Trapping on tss switch won't work - most probably on Intel. Credits also go to Joerg Roedel - I used his once posted debugging series as platform for this patch. 
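As a stand-alone illustration (not kernel code) of how this patch sanitizes guest writes to the debug registers: a DR7 write keeps only the volatile bits, forces the fixed-1 bits, and arms db-register switching whenever a breakpoint-enable bit is set, mirroring svm_set_dr()/handle_dr() and the mask values from the kvm_host.h hunk of this patch.

  #include <stdio.h>

  /* Mask values copied from the kvm_host.h hunk of this patch. */
  #define DR6_FIXED_1    0xffff0ff0UL
  #define DR6_VOLATILE   0x0000e00fUL
  #define DR7_FIXED_1    0x00000400UL
  #define DR7_VOLATILE   0xffff23ffUL
  #define DR7_BP_EN_MASK 0x000000ffUL

  int main(void)
  {
          /* Guest enables breakpoint 0 globally (bit 1) and sets GE (bit 9). */
          unsigned long dr7_write = (1UL << 1) | (1UL << 9);
          unsigned long dr7 = (dr7_write & DR7_VOLATILE) | DR7_FIXED_1;
          /* Guest clears DR6; the fixed bits survive anyway. */
          unsigned long dr6 = (0UL & DR6_VOLATILE) | DR6_FIXED_1;

          printf("dr7=%#lx dr6=%#lx switch_db_regs=%#lx\n",
                 dr7, dr6, dr7_write & DR7_BP_EN_MASK);
          return 0;
  }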
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 22 ++++++++ arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/kvm_svm.h | 6 --- arch/x86/kvm/svm.c | 116 +++++++++++++++------------------------- arch/x86/kvm/vmx.c | 114 +++++++++++++++++++++++++++++++++------ arch/x86/kvm/x86.c | 29 ++++++++++ 6 files changed, 193 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c430cd5..0a4dab2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -135,6 +135,19 @@ enum { #define KVM_NR_MEM_OBJS 40 +#define KVM_NR_DB_REGS 4 + +#define DR6_BD (1 << 13) +#define DR6_BS (1 << 14) +#define DR6_FIXED_1 0xffff0ff0 +#define DR6_VOLATILE 0x0000e00f + +#define DR7_BP_EN_MASK 0x000000ff +#define DR7_GE (1 << 9) +#define DR7_GD (1 << 13) +#define DR7_FIXED_1 0x00000400 +#define DR7_VOLATILE 0xffff23ff + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -334,6 +347,15 @@ struct kvm_vcpu_arch { struct mtrr_state_type mtrr_state; u32 pat; + + int switch_db_regs; + unsigned long host_db[KVM_NR_DB_REGS]; + unsigned long host_dr6; + unsigned long host_dr7; + unsigned long db[KVM_NR_DB_REGS]; + unsigned long dr6; + unsigned long dr7; + unsigned long eff_db[KVM_NR_DB_REGS]; }; struct kvm_mem_alias { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 32159f0..498f944 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -312,7 +312,7 @@ enum vmcs_field { #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ #define TYPE_MOV_TO_DR (0 << 4) #define TYPE_MOV_FROM_DR (1 << 4) -#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ +#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. 
*/ /* segment AR */ diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 9167341..ed66e4c 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -18,7 +18,6 @@ static const u32 host_save_user_msrs[] = { }; #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) -#define NUM_DB_REGS 4 struct kvm_vcpu; @@ -29,16 +28,11 @@ struct vcpu_svm { struct svm_cpu_data *svm_data; uint64_t asid_generation; - unsigned long db_regs[NUM_DB_REGS]; - u64 next_rip; u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; u64 host_gs_base; unsigned long host_cr2; - unsigned long host_db_regs[NUM_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; u32 *msrpm; struct vmcb *hsave; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 88d9062..815f50e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,9 +38,6 @@ MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DR7_GD_MASK (1 << 13) -#define DR6_BD_MASK (1 << 13) - #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 @@ -181,32 +178,6 @@ static inline void kvm_write_cr2(unsigned long val) asm volatile ("mov %0, %%cr2" :: "r" (val)); } -static inline unsigned long read_dr6(void) -{ - unsigned long dr6; - - asm volatile ("mov %%dr6, %0" : "=r" (dr6)); - return dr6; -} - -static inline void write_dr6(unsigned long val) -{ - asm volatile ("mov %0, %%dr6" :: "r" (val)); -} - -static inline unsigned long read_dr7(void) -{ - unsigned long dr7; - - asm volatile ("mov %%dr7, %0" : "=r" (dr7)); - return dr7; -} - -static inline void write_dr7(unsigned long val) -{ - asm volatile ("mov %0, %%dr7" :: "r" (val)); -} - static inline void force_new_asid(struct kvm_vcpu *vcpu) { to_svm(vcpu)->asid_generation--; @@ -695,7 +666,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - memset(svm->db_regs, 0, sizeof(svm->db_regs)); init_vmcb(svm); fx_init(&svm->vcpu); @@ -1035,7 +1005,29 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - unsigned long val = to_svm(vcpu)->db_regs[dr]; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long val; + + switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; + case 6: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + val = vcpu->arch.dr6; + else + val = svm->vmcb->save.dr6; + break; + case 7: + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + val = vcpu->arch.dr7; + else + val = svm->vmcb->save.dr7; + break; + default: + val = 0; + } + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -1045,33 +1037,40 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; + KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - if (svm->vmcb->save.dr7 & DR7_GD_MASK) { - svm->vmcb->save.dr7 &= ~DR7_GD_MASK; - svm->vmcb->save.dr6 |= DR6_BD_MASK; - *exception = DB_VECTOR; - return; - } + *exception = 0; switch (dr) { case 0 ... 3: - svm->db_regs[dr] = value; + vcpu->arch.db[dr] = value; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = value; return; case 4 ... 
5: - if (vcpu->arch.cr4 & X86_CR4_DE) { + if (vcpu->arch.cr4 & X86_CR4_DE) *exception = UD_VECTOR; + return; + case 6: + if (value & 0xffffffff00000000ULL) { + *exception = GP_VECTOR; return; } - case 7: { - if (value & ~((1ULL << 32) - 1)) { + vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; + return; + case 7: + if (value & 0xffffffff00000000ULL) { *exception = GP_VECTOR; return; } - svm->vmcb->save.dr7 = value; + vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + svm->vmcb->save.dr7 = vcpu->arch.dr7; + vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); + } return; - } default: + /* FIXME: Possible case? */ printk(KERN_DEBUG "%s: unexpected dr %u\n", __func__, dr); *exception = UD_VECTOR; @@ -2365,22 +2364,6 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void save_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); - asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); - asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); - asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); -} - -static void load_db_regs(unsigned long *db_regs) -{ - asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); - asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); - asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); - asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); -} - static void svm_flush_tlb(struct kvm_vcpu *vcpu) { force_new_asid(vcpu); @@ -2439,20 +2422,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); svm->host_cr2 = kvm_read_cr2(); - svm->host_dr6 = read_dr6(); - svm->host_dr7 = read_dr7(); if (!is_nested(svm)) svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; - if (svm->vmcb->save.dr7 & 0xff) { - write_dr7(0); - save_db_regs(svm->host_db_regs); - load_db_regs(svm->db_regs); - } - clgi(); local_irq_enable(); @@ -2528,16 +2503,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - if ((svm->vmcb->save.dr7 & 0xff)) - load_db_regs(svm->host_db_regs); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - write_dr6(svm->host_dr6); - write_dr7(svm->host_dr7); kvm_write_cr2(svm->host_cr2); kvm_load_fs(fs_selector); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c776868..0989776 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2311,7 +2311,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0); kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); vmcs_writel(GUEST_GDTR_BASE, 0); @@ -2577,7 +2576,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info, ex_no, error_code; - unsigned long cr2, rip; + unsigned long cr2, rip, dr6; u32 vect_info; enum emulation_result er; @@ -2637,14 +2636,28 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } ex_no = intr_info & INTR_INFO_VECTOR_MASK; - if (ex_no == DB_VECTOR || ex_no == BP_VECTOR) { + switch (ex_no) { + case DB_VECTOR: + dr6 = vmcs_readl(EXIT_QUALIFICATION); + if (!(vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 
+ vcpu->arch.dr6 = dr6 | DR6_FIXED_1; + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); + /* fall through */ + case BP_VECTOR: kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; - } else { + break; + default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; kvm_run->ex.error_code = error_code; + break; } return 0; } @@ -2784,21 +2797,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; - /* - * FIXME: this code assumes the host is debugging the guest. - * need to deal with guest debugging itself too. - */ + dr = vmcs_readl(GUEST_DR7); + if (dr & DR7_GD) { + /* + * As the vm-exit takes precedence over the debug trap, we + * need to emulate the latter, either for the host or the + * guest debugging itself. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { + kvm_run->debug.arch.dr6 = vcpu->arch.dr6; + kvm_run->debug.arch.dr7 = dr; + kvm_run->debug.arch.pc = + vmcs_readl(GUEST_CS_BASE) + + vmcs_readl(GUEST_RIP); + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } else { + vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 |= DR6_BD; + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + kvm_queue_exception(vcpu, DB_VECTOR); + return 1; + } + } + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & 7; - reg = (exit_qualification >> 8) & 15; - if (exit_qualification & 16) { - /* mov from dr */ + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + reg = DEBUG_REG_ACCESS_REG(exit_qualification); + if (exit_qualification & TYPE_MOV_FROM_DR) { switch (dr) { + case 0 ... 3: + val = vcpu->arch.db[dr]; + break; case 6: - val = 0xffff0ff0; + val = vcpu->arch.dr6; break; case 7: - val = 0x400; + val = vcpu->arch.dr7; break; default: val = 0; @@ -2806,7 +2842,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { - /* mov to dr */ + val = vcpu->arch.regs[reg]; + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4 ... 5: + if (vcpu->arch.cr4 & X86_CR4_DE) + kvm_queue_exception(vcpu, UD_VECTOR); + break; + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_queue_exception(vcpu, GP_VECTOR); + break; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 7: + if (val & 0xffffffff00000000ULL) { + kvm_queue_exception(vcpu, GP_VECTOR); + break; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = + (val & DR7_BP_EN_MASK); + } + break; + } + KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -2957,7 +3024,18 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } tss_selector = exit_qualification; - return kvm_task_switch(vcpu, tss_selector, reason); + if (!kvm_task_switch(vcpu, tss_selector, reason)) + return 0; + + /* clear all local breakpoint enable flags */ + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); + + /* + * TODO: What about debug traps on tss switch? + * Are we supposed to inject them and update dr6? 
+ */ + + return 1; } static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -3342,6 +3420,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); + set_debugreg(vcpu->arch.dr6, 6); + asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3436,6 +3516,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); vcpu->arch.regs_dirty = 0; + get_debugreg(vcpu->arch.dr6, 6); + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e990d16..300bc4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3025,10 +3025,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); + get_debugreg(vcpu->arch.host_dr6, 6); + get_debugreg(vcpu->arch.host_dr7, 7); + if (unlikely(vcpu->arch.switch_db_regs)) { + get_debugreg(vcpu->arch.host_db[0], 0); + get_debugreg(vcpu->arch.host_db[1], 1); + get_debugreg(vcpu->arch.host_db[2], 2); + get_debugreg(vcpu->arch.host_db[3], 3); + + set_debugreg(0, 7); + set_debugreg(vcpu->arch.eff_db[0], 0); + set_debugreg(vcpu->arch.eff_db[1], 1); + set_debugreg(vcpu->arch.eff_db[2], 2); + set_debugreg(vcpu->arch.eff_db[3], 3); + } KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); + if (unlikely(vcpu->arch.switch_db_regs)) { + set_debugreg(0, 7); + set_debugreg(vcpu->arch.host_db[0], 0); + set_debugreg(vcpu->arch.host_db[1], 1); + set_debugreg(vcpu->arch.host_db[2], 2); + set_debugreg(vcpu->arch.host_db[3], 3); + } + set_debugreg(vcpu->arch.host_dr6, 6); + set_debugreg(vcpu->arch.host_dr7, 7); + vcpu->guest_mode = 0; local_irq_enable(); @@ -4035,6 +4059,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.nmi_pending = false; vcpu->arch.nmi_injected = false; + vcpu->arch.switch_db_regs = 0; + memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr7 = DR7_FIXED_1; + return kvm_x86_ops->vcpu_reset(vcpu); } -- cgit v1.1 From ae675ef01cd86014acf8da5dee87876b71122495 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 15 Dec 2008 13:52:10 +0100 Subject: KVM: x86: Wire-up hardware breakpoints for guest debugging Add the remaining bits to make use of debug registers also for guest debugging, thus enabling the use of hardware breakpoints and watchpoints. 
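From userspace, arming such a hardware breakpoint means handing the raw DR contents to KVM through the arch block. A hedged sketch, reusing the DR7 encoding the removed in-kernel code above used for an execution breakpoint in slot 0 (global enable is bit 1, R/W and LEN fields left at zero); vcpu_fd is assumed to come from KVM_CREATE_VCPU and the helper name is hypothetical.

  #include <linux/kvm.h>
  #include <string.h>
  #include <sys/ioctl.h>

  /* Hypothetical helper: break on execution at guest linear address 'gva'. */
  static int set_hw_exec_bp(int vcpu_fd, unsigned long gva)
  {
          struct kvm_guest_debug dbg;

          memset(&dbg, 0, sizeof(dbg));
          dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
          dbg.arch.debugreg[0] = gva;  /* DR0: breakpoint address */
          dbg.arch.debugreg[7] = 2;    /* DR7: global enable, slot 0, execute */
          return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
  }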
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 5 +++++ arch/x86/kvm/vmx.c | 5 +++++ arch/x86/kvm/x86.c | 14 +++++++++++++- 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 815f50e..b3a7a31 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -958,6 +958,11 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) } else vcpu->guest_debug = 0; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; + else + svm->vmcb->save.dr7 = vcpu->arch.dr7; + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; else if (old_debug & KVM_GUESTDBG_SINGLESTEP) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0989776..cee81c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1017,6 +1017,11 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) vcpu->guest_debug = 0; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) + vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); + else + vmcs_writel(GUEST_DR7, vcpu->arch.dr7); + flags = vmcs_readl(GUEST_RFLAGS); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 300bc4d..2477e87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3861,10 +3861,22 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int r; + int i, r; vcpu_load(vcpu); + if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; ++i) + vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; + vcpu->arch.switch_db_regs = + (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); + } else { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); + } + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); if (dbg->control & KVM_GUESTDBG_INJECT_DB) -- cgit v1.1 From a770f6f28b1a9287189f3dc8333eb694d9a2f0ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 21 Dec 2008 19:20:09 +0200 Subject: KVM: MMU: Inherit a shadow page's guest level count from vcpu setup Instead of "calculating" it on every shadow page allocation, set it once when switching modes, and copy it when allocating pages. This doesn't buy us much, but sets up the stage for inheriting more information related to the mmu setup. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0a4dab2..28f875f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_mmu { hpa_t root_hpa; int root_level; int shadow_root_level; + union kvm_mmu_page_role base_role; u64 *pae_root; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2d4477c..f15023c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1204,8 +1204,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; + role = vcpu->arch.mmu.base_role; role.level = level; role.metaphysical = metaphysical; role.access = access; @@ -2251,17 +2250,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { + int r; + ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); + r = paging64_init_context(vcpu); else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu); else - return paging32_init_context(vcpu); + r = paging32_init_context(vcpu); + + vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + + return r; } static int init_kvm_mmu(struct kvm_vcpu *vcpu) -- cgit v1.1 From 2f0b3d60b2c43aef7cd10169c425c052169c622a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 21 Dec 2008 19:27:36 +0200 Subject: KVM: MMU: Segregate mmu pages created with different cr4.pge settings Don't allow a vcpu with cr4.pge cleared to use a shadow page created with cr4.pge set; this might cause a cr3 switch not to sync ptes that have the global bit set (the global bit has no effect if !cr4.pge). This can only occur on smp with different cr4.pge settings for different vcpus (since a cr4 change will resync the shadow ptes), but there's no cost to being correct here. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 28f875f..c2a01d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -183,6 +183,7 @@ union kvm_mmu_page_role { unsigned metaphysical:1; unsigned access:3; unsigned invalid:1; + unsigned cr4_pge:1; }; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2477e87..873602b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -364,6 +364,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; + vcpu->arch.mmu.base_role.cr4_pge = !!(cr4 & X86_CR4_PGE); kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } -- cgit v1.1 From e2078318042569682d0496cfd7bd962a766e82b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 21 Dec 2008 19:36:59 +0200 Subject: KVM: MMU: Initialize a shadow page's global attribute from cr4.pge If cr4.pge is cleared, we ought to treat any ptes in the page as non-global. This allows us to remove the check from set_spte(). 
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f15023c..5f03ec3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -797,7 +797,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, ASSERT(is_empty_shadow_page(sp->spt)); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; - sp->global = 1; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return sp; @@ -1241,6 +1240,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; + sp->global = role.cr4_pge; hlist_add_head(&sp->hash_link, bucket); if (!metaphysical) { if (rmap_write_protect(vcpu->kvm, gfn)) @@ -1668,8 +1668,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, u64 mt_mask = shadow_mt_mask; struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); - if (!(vcpu->arch.cr4 & X86_CR4_PGE)) - global = 0; if (!global && sp->global) { sp->global = 0; if (sp->unsync) { -- cgit v1.1 From 2b3d2a206037b1471de6a6dc51427af034cfdb47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 23 Dec 2008 19:46:01 +0200 Subject: KVM: Fix vmload and friends misinterpreted as lidt The AMD SVM instruction family all overload the 0f 01 /3 opcode, further multiplexing on the three r/m bits. But the code decided that anything that isn't a vmmcall must be an lidt (which shares the 0f 01 /3 opcode, for the case that mod = 3). Fix by aborting emulation if this isn't a vmmcall. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d174db7..54fb098 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1908,11 +1908,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3 && c->modrm_rm == 1) { - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - kvm_emulate_hypercall(ctxt->vcpu); + if (c->modrm_mod == 3) { + switch (c->modrm_rm) { + case 1: + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + break; + default: + goto cannot_emulate; + } } else { rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, -- cgit v1.1 From 2d11123a77e54d5cea262c958e8498f4a08bce3d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 14:39:47 +0200 Subject: KVM: MMU: Add for_each_shadow_entry(), a simpler alternative to walk_shadow() Using a for_each loop style removes the need to write callback and nasty casts. Implement the walk_shadow() using the for_each_shadow_entry(). 
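For illustration, a hypothetical walker (not part of this patch) that reports how deep the shadow hierarchy currently reaches for an address becomes a plain loop in the new style; it would sit in mmu.c next to the iterator and, like walk_shadow(), expects mmu_lock to be held.

  /* Hypothetical mmu.c helper: depth of the shadow walk for 'addr'. */
  static int shadow_walk_depth(struct kvm_vcpu *vcpu, u64 addr)
  {
          struct kvm_shadow_walk_iterator iterator;
          int depth = 0;

          for_each_shadow_entry(vcpu, addr, iterator) {
                  ++depth;
                  if (!is_shadow_present_pte(*iterator.sptep))
                          break;
          }
          return depth;
  }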
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 69 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5f03ec3..c669f2a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -150,6 +150,20 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; +}; + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ + for (shadow_walk_init(&(_walker), _vcpu, _addr); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + + struct kvm_unsync_walk { int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); }; @@ -1254,33 +1268,48 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + iterator->addr = addr; + iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->level = vcpu->arch.mmu.shadow_root_level; + if (iterator->level == PT32E_ROOT_LEVEL) { + iterator->shadow_addr + = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + --iterator->level; + if (!iterator->shadow_addr) + iterator->level = 0; + } +} + +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator->level < PT_PAGE_TABLE_LEVEL) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; + --iterator->level; +} + static int walk_shadow(struct kvm_shadow_walk *walker, struct kvm_vcpu *vcpu, u64 addr) { - hpa_t shadow_addr; - int level; + struct kvm_shadow_walk_iterator iterator; int r; - u64 *sptep; - unsigned index; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - if (!shadow_addr) - return 1; - --level; - } - while (level >= PT_PAGE_TABLE_LEVEL) { - index = SHADOW_PT_INDEX(addr, level); - sptep = ((u64 *)__va(shadow_addr)) + index; - r = walker->entry(walker, vcpu, addr, sptep, level); + for_each_shadow_entry(vcpu, addr, iterator) { + r = walker->entry(walker, vcpu, addr, + iterator.sptep, iterator.level); if (r) return r; - shadow_addr = *sptep & PT64_BASE_ADDR_MASK; - --level; } return 0; } -- cgit v1.1 From 9f652d21c3f887075a33abae85bf53fec64e67b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 14:54:25 +0200 Subject: KVM: MMU: Use for_each_shadow_entry() in __direct_map() Eliminating a callback and a useless structure. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 83 +++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c669f2a..a25e1ad 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1846,67 +1846,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -struct direct_shadow_walk { - struct kvm_shadow_walk walker; - pfn_t pfn; - int write; - int largepage; - int pt_write; -}; - -static int direct_map_entry(struct kvm_shadow_walk *_walk, - struct kvm_vcpu *vcpu, - u64 addr, u64 *sptep, int level) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) { - struct direct_shadow_walk *walk = - container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + int pt_write = 0; gfn_t pseudo_gfn; - gfn_t gfn = addr >> PAGE_SHIFT; - - if (level == PT_PAGE_TABLE_LEVEL - || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, - 0, walk->write, 1, &walk->pt_write, - walk->largepage, 0, gfn, walk->pfn, false); - ++vcpu->stat.pf_fixed; - return 1; - } - if (*sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, - 1, ACC_ALL, sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(walk->pfn); - return -ENOMEM; + for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { + if (iterator.level == PT_PAGE_TABLE_LEVEL + || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, + largepage, 0, gfn, pfn, false); + ++vcpu->stat.pf_fixed; + break; } - set_shadow_pte(sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - return 0; -} + if (*iterator.sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(pfn); + return -ENOMEM; + } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - int r; - struct direct_shadow_walk walker = { - .walker = { .entry = direct_map_entry, }, - .pfn = pfn, - .largepage = largepage, - .write = write, - .pt_write = 0, - }; - - r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); - if (r < 0) - return r; - return walker.pt_write; + set_shadow_pte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); + } + } + return pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -- cgit v1.1 From e7a04c99b54ad9acb98a56113ec3163bc1039e13 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 15:10:50 +0200 Subject: KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in fetch() Effectively reverting to the pre walk_shadow() version -- but now with the reusable for_each(). 
Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 128 ++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 70 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9fd78b6..69c7e33 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -283,91 +283,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); - struct guest_walker *gw = sw->guest_walker; unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; - u64 spte; + u64 spte, *sptep; int metaphysical; gfn_t table_gfn; int r; + int level; pt_element_t curr_pte; + struct kvm_shadow_walk_iterator iterator; - if (level == PT_PAGE_TABLE_LEVEL - || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, - sw->user_fault, sw->write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, - sw->ptwrite, sw->largepage, - gw->ptes[gw->level-1] & PT_GLOBAL_MASK, - gw->gfn, sw->pfn, false); - sw->sptep = sptep; - return 1; - } - - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - return 0; - - if (is_large_pte(*sptep)) { - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - rmap_remove(vcpu->kvm, sptep); - } + if (!is_present_pte(gw->ptes[gw->level - 1])) + return NULL; - if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(gw->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = gw->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, - metaphysical, access, sptep); - if (!metaphysical) { - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); - kvm_release_pfn_clean(sw->pfn); - sw->sptep = NULL; - return 1; + for_each_shadow_entry(vcpu, addr, iterator) { + level = iterator.level; + sptep = iterator.sptep; + if (level == PT_PAGE_TABLE_LEVEL + || (largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, + gw->pte_access & access, + user_fault, write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + ptwrite, largepage, + gw->ptes[gw->level-1] & PT_GLOBAL_MASK, + gw->gfn, pfn, false); + break; } - } - spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; - return 0; -} + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + continue; -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *guest_walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) -{ - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_walk_entry), }, - .guest_walker = guest_walker, - .user_fault = user_fault, - .write_fault = write_fault, - .largepage = largepage, - .ptwrite = ptwrite, - .pfn = pfn, - }; + if 
(is_large_pte(*sptep)) { + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + rmap_remove(vcpu->kvm, sptep); + } - if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) - return NULL; + if (level == PT_DIRECTORY_LEVEL + && gw->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, access, sptep); + if (!metaphysical) { + r = kvm_read_guest_atomic(vcpu->kvm, + gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_mmu_put_page(shadow_page, sptep); + kvm_release_pfn_clean(pfn); + sptep = NULL; + break; + } + } - walk_shadow(&walker.walker, vcpu, addr); + spte = __pa(shadow_page->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + } - return walker.sptep; + return sptep; } /* -- cgit v1.1 From a461930bc3cece021f8f89a80dcc1d0691a92b52 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 15:19:00 +0200 Subject: KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in invlpg() Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 81 ++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 69c7e33..46b68f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,7 +25,6 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 - #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK @@ -42,7 +41,6 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 - #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK @@ -73,18 +71,6 @@ struct guest_walker { u32 error_code; }; -struct shadow_walker { - struct kvm_shadow_walk walker; - struct guest_walker *guest_walker; - int user_fault; - int write_fault; - int largepage; - int *ptwrite; - pfn_t pfn; - u64 *sptep; - gpa_t pte_gpa; -}; - static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -453,54 +439,52 @@ out_unlock: return 0; } -static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); + struct kvm_shadow_walk_iterator iterator; + pt_element_t gpte; + gpa_t pte_gpa = -1; + int level; + u64 *sptep; + + spin_lock(&vcpu->kvm->mmu_lock); - /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + for_each_shadow_entry(vcpu, gva, iterator) { + level = iterator.level; + sptep = iterator.sptep; - sw->pte_gpa = (sp->gfn << PAGE_SHIFT); - sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + /* FIXME: properly handle invlpg on large guest pages */ + if (level == 
PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); - if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - if (is_large_pte(*sptep)) - --vcpu->kvm->stat.lpages; + pte_gpa = (sp->gfn << PAGE_SHIFT); + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + } + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + break; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - return 1; - } - if (!is_shadow_present_pte(*sptep)) - return 1; - return 0; -} -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) -{ - pt_element_t gpte; - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_invlpg_entry), }, - .pte_gpa = -1, - }; + if (!is_shadow_present_pte(*sptep)) + break; + } - spin_lock(&vcpu->kvm->mmu_lock); - walk_shadow(&walker.walker, vcpu, gva); spin_unlock(&vcpu->kvm->mmu_lock); - if (walker.pte_gpa == -1) + + if (pte_gpa == -1) return; - if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return; if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; - kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, + kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, sizeof(pt_element_t), 0); } } @@ -607,7 +591,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef pt_element_t #undef guest_walker -#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -- cgit v1.1 From e8c4a4e8a7cc047dfb3c26b2cbc8599ad3460364 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Dec 2008 15:20:07 +0200 Subject: KVM: MMU: Drop walk_shadow() No longer used. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a25e1ad..1585080 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -145,11 +145,6 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; -struct kvm_shadow_walk { - int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, - u64 addr, u64 *spte, int level); -}; - struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; @@ -1299,21 +1294,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } -static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, u64 addr) -{ - struct kvm_shadow_walk_iterator iterator; - int r; - - for_each_shadow_entry(vcpu, addr, iterator) { - r = walker->entry(walker, vcpu, addr, - iterator.sptep, iterator.level); - if (r) - return r; - } - return 0; -} - static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.1 From 53f658b3c33616a4997ee254311b335e59063289 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Dec 2008 20:45:05 +0100 Subject: KVM: VMX: initialize TSC offset relative to vm creation time VMX initializes the TSC offset for each vcpu at different times, and also reinitializes it for vcpus other than 0 on APIC SIPI message. This bug causes the TSC's to appear unsynchronized in the guest, even if the host is good. 
Older Linux kernels don't handle the situation very well, so gettimeofday is likely to go backwards in time: http://www.mail-archive.com/kvm@vger.kernel.org/msg02955.html http://sourceforge.net/tracker/index.php?func=detail&aid=2025534&group_id=180599&atid=893831 Fix it by initializating the offset of each vcpu relative to vm creation time, and moving it from vmx_vcpu_reset to vmx_vcpu_setup, out of the APIC MP init path. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 19 +++++++++++-------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2a01d0..9efc446 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -398,6 +398,7 @@ struct kvm_arch{ unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; + u64 vm_init_tsc; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cee81c9..3312047 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -865,11 +865,8 @@ static u64 guest_read_tsc(void) * writes 'guest_tsc' into guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc */ -static void guest_write_tsc(u64 guest_tsc) +static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) { - u64 host_tsc; - - rdtscll(host_tsc); vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } @@ -934,6 +931,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr; + u64 host_tsc; int ret = 0; switch (msr_index) { @@ -959,7 +957,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TIME_STAMP_COUNTER: - guest_write_tsc(data); + rdtscll(host_tsc); + guest_write_tsc(data, host_tsc); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -2109,7 +2108,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat; + u64 host_pat, tsc_this, tsc_base; unsigned long a; struct descriptor_table dt; int i; @@ -2237,6 +2236,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; + rdtscll(tsc_this); + if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) + tsc_base = tsc_this; + + guest_write_tsc(0, tsc_base); return 0; } @@ -2328,8 +2333,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - guest_write_tsc(0); - /* Special registers */ vmcs_write64(GUEST_IA32_DEBUGCTL, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 873602b..3b2acfd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4170,6 +4170,8 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + rdtscll(kvm->arch.vm_init_tsc); + return kvm; } -- cgit v1.1 From 77c2002e7c6f019f59a6f3cc5f8b16b41748dbe1 Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Mon, 29 Dec 2008 01:42:19 +0200 Subject: KVM: introduce kvm_read_guest_virt, kvm_write_guest_virt This commit change the name of emulator_read_std into kvm_read_guest_virt, and add new function 
name kvm_write_guest_virt that allow writing into a guest virtual address. Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 --- arch/x86/kvm/x86.c | 56 ++++++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9efc446..b74576a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -609,10 +609,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b2acfd..67f9176 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1976,10 +1976,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -1987,27 +1985,57 @@ int emulator_read_std(unsigned long addr, while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; if (gpa == UNMAPPED_GVA) { r = X86EMUL_PROPAGATE_FAULT; goto out; } - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); + ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { r = X86EMUL_UNHANDLEABLE; goto out; } - bytes -= tocopy; - data += tocopy; - addr += tocopy; + bytes -= toread; + data += toread; + addr += toread; } out: return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); + +int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= towrite; + data += towrite; + addr += towrite; + } +out: + return r; +} + static int emulator_read_emulated(unsigned long addr, void *val, @@ -2029,8 +2057,8 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -2233,7 +2261,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); @@ -2241,7 +2269,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, + .read_std = kvm_read_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, -- cgit v1.1 From 0f346074403bc109f9569f14b45cb09e83729032 Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Mon, 29 Dec 2008 01:42:20 +0200 Subject: KVM: remove the vmap usage vmap() on guest pages hides those pages from the Linux mm for an extended (userspace determined) amount of time. Get rid of it. Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 62 ++++++++++++------------------------------------------ 1 file changed, 13 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67f9176..2a4f3a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2371,40 +2371,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } -} - static int pio_copy_data(struct kvm_vcpu *vcpu) { void *p = vcpu->arch.pio_data; - void *q; + gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int ret; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, - PAGE_KERNEL); - if (!q) { - free_pio_guest_pages(vcpu); - return -ENOMEM; - } - q += vcpu->arch.pio.guest_page_offset; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - memcpy(q, p, bytes); + ret = kvm_write_guest_virt(q, p, bytes, vcpu); else - memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; - vunmap(q); - free_pio_guest_pages(vcpu); - return 0; + ret = kvm_read_guest_virt(q, p, bytes, vcpu); + return ret; } int complete_pio(struct kvm_vcpu *vcpu) @@ -2515,7 +2494,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 0; vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; vcpu->arch.pio.rep = 0; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2543,9 +2521,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i, ret = 0; - int nr_pages = 1; - struct page *page; + int ret = 0; struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2557,7 +2533,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.in = in; vcpu->arch.pio.string = 1; vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); vcpu->arch.pio.rep = rep; if (vcpu->run->io.direction == KVM_EXIT_IO_IN) @@ -2577,15 +2552,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, else in_page = offset_in_page(address) + size; now = min(count, (unsigned long)in_page / size); - if (!now) { - /* - * String I/O straddles page boundary. Pin two guest pages - * so that we satisfy atomicity constraints. Do just one - * transaction to avoid complexity. - */ - nr_pages = 2; + if (!now) now = 1; - } if (down) { /* * String I/O in reverse. Yuck. Kill the guest, fix later. 
@@ -2600,15 +2568,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); - for (i = 0; i < nr_pages; ++i) { - page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - if (!page) { - kvm_inject_gp(vcpu, 0); - free_pio_guest_pages(vcpu); - return 1; - } - } + vcpu->arch.pio.guest_gva = address; pio_dev = vcpu_find_pio_dev(vcpu, port, vcpu->arch.pio.cur_count, @@ -2616,7 +2576,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret >= 0 && pio_dev) { + if (ret == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (ret == 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); if (vcpu->arch.pio.count == 0) -- cgit v1.1 From 61a6bd672bda3b9468bf5895c1be085c4e481138 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 29 Dec 2008 17:32:28 +0200 Subject: KVM: Fallback support for MSR_VM_HSAVE_PA Since we advertise MSR_VM_HSAVE_PA, userspace will attempt to read it even on Intel. Implement fake support for this MSR to avoid the warnings. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a4f3a6..c3fbe8c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -742,6 +742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); @@ -863,6 +864,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_VM_HSAVE_PA: data = 0; break; case MSR_MTRRcap: -- cgit v1.1 From 52d939a0bf44081bc9f69b4fbdc9e7f416df27c7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 30 Dec 2008 15:55:06 -0200 Subject: KVM: PIT: provide an option to disable interrupt reinjection Certain clocks (such as TSC) in older 2.6 guests overaccount for lost ticks, causing severe time drift. Interrupt reinjection magnifies the problem. Provide an option to disable it. 
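To see why reinjection makes the drift worse, a simplified userspace model of the pending-tick bookkeeping helps (this is only a sketch; the real code keeps the count in an atomic_t inside the hrtimer callback, and the names below are invented for the illustration). With reinjection, every tick that fires while the guest cannot take the interrupt stays queued and is replayed later; with reinjection disabled, the backlog is clamped to a single pending tick, which is what a guest that already compensates for lost ticks wants.

#include <stdio.h>
#include <stdbool.h>

/* Toy model of the PIT pending-tick counter (illustrative only). */
struct pit_timer_model {
        int pending;    /* ticks raised but not yet delivered to the guest */
        bool reinject;  /* replay the backlog tick by tick? */
};

static void timer_fires(struct pit_timer_model *t)
{
        t->pending++;
        if (!t->reinject)
                t->pending = 1;         /* coalesce: at most one tick is ever owed */
}

static void guest_takes_irq(struct pit_timer_model *t)
{
        if (t->pending > 0)
                t->pending--;
}

int main(void)
{
        struct pit_timer_model replay = { 0, true }, coalesce = { 0, false };
        int i;

        /* Five ticks fire while the guest has interrupts masked. */
        for (i = 0; i < 5; i++) {
                timer_fires(&replay);
                timer_fires(&coalesce);
        }
        guest_takes_irq(&replay);
        guest_takes_irq(&coalesce);

        printf("reinject=1: %d ticks still owed\n", replay.pending);    /* 4 */
        printf("reinject=0: %d ticks still owed\n", coalesce.pending);  /* 0 */
        return 0;
}

The hunk in i8254.c below does the clamping in exactly one place, right where the timer callback raises the tick.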
[avi: allow room for expansion in case we want to disable reinjection of other timers] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 5 +++++ arch/x86/kvm/i8254.c | 4 ++++ arch/x86/kvm/i8254.h | 1 + arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 4 files changed, 31 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 32eb96c..54bcf22 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -233,4 +233,9 @@ struct kvm_guest_debug_arch { struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; + +struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; +}; #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 72bd275..528daad 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (!pt->reinject) + atomic_set(&pt->pending, 1); + if (vcpu0 && waitqueue_active(&vcpu0->wq)) wake_up_interruptible(&vcpu0->wq); @@ -580,6 +583,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + pit_state->pit_timer.reinject = true; mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 4178022..76959c4 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -9,6 +9,7 @@ struct kvm_kpit_timer { s64 period; /* unit: ns */ s64 scheduled; atomic_t pending; + bool reinject; }; struct kvm_kpit_channel_state { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c3fbe8c..a1f1461 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -993,6 +993,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_REINJECT_CONTROL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1728,6 +1729,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) return r; } +static int kvm_vm_ioctl_reinject(struct kvm *kvm, + struct kvm_reinject_control *control) +{ + if (!kvm->arch.vpit) + return -ENXIO; + kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1925,6 +1935,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_REINJECT_CONTROL: { + struct kvm_reinject_control control; + r = -EFAULT; + if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = kvm_vm_ioctl_reinject(kvm, &control); + if (r) + goto out; + r = 0; + break; + } default: ; } -- cgit v1.1 From 1c08364c3565242f1e1bd585bc2ce458967941af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 12:39:07 +0200 Subject: KVM: Move struct kvm_pio_request into x86 kvm_host.h This is an x86 specific stucture and has no business living in common code. 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b74576a..863ea73 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -227,6 +227,18 @@ struct kvm_pv_mmu_op_buffer { char buf[512] __aligned(sizeof(long)); }; +struct kvm_pio_request { + unsigned long count; + int cur_count; + gva_t guest_gva; + int in; + int port; + int size; + int string; + int down; + int rep; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu -- cgit v1.1 From 1872a3f411ffe95c8e92300e0986a3532db55ce9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 23:26:52 +0200 Subject: KVM: VMX: Fix guest state validity checks The vmx guest state validity checks are full of bugs. Make them conform to the manual. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3312047..be94410 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1784,14 +1784,16 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); cs_rpl = cs.selector & SELECTOR_RPL_MASK; + if (cs.unusable) + return false; if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.type & AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; - } else if (cs.type & AR_TYPE_CODE_MASK) { + } else { if (cs.dpl != cs_rpl) return false; } @@ -1810,7 +1812,9 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); ss_rpl = ss.selector & SELECTOR_RPL_MASK; - if ((ss.type != 3) || (ss.type != 7)) + if (ss.unusable) + return true; + if (ss.type != 3 && ss.type != 7) return false; if (!ss.s) return false; @@ -1830,6 +1834,8 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) vmx_get_segment(vcpu, &var, seg); rpl = var.selector & SELECTOR_RPL_MASK; + if (var.unusable) + return true; if (!var.s) return false; if (!var.present) @@ -1851,9 +1857,11 @@ static bool tr_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + if (tr.unusable) + return false; if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ return false; - if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; if (!tr.present) return false; @@ -1867,6 +1875,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + if (ldtr.unusable) + return true; if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) -- cgit v1.1 From 9fd4a3b7a412f983696b23121413a79d2132fed6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 23:43:42 +0200 Subject: KVM: VMX: don't clobber segment AR if emulating invalid state The ususable bit is important for determining state validity; don't clobber it. 
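Both this patch and the validity-check fixes above revolve around the access-rights (AR) word that vmx_get_segment() unpacks. The standalone sketch below approximates that unpacking (bit positions follow the unpacking code in this series: type in bits 0-3, S in bit 4, DPL in bits 5-6, present in bit 7, unusable in bit 16; the real checks cover more fields than shown) and applies the corrected stack-segment rule: an unusable segment passes, and a usable one must be a writable data type, 3 or 7; note the '&&' where the old code had an always-true '||'.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct seg {
        unsigned type, s, dpl, present, unusable;
        uint16_t selector;
};

/* Rough decode of a VMX access-rights word into the fields checked below. */
static struct seg decode_ar(uint32_t ar, uint16_t selector)
{
        struct seg v = {
                .type     = ar & 15,
                .s        = (ar >> 4) & 1,
                .dpl      = (ar >> 5) & 3,
                .present  = (ar >> 7) & 1,
                .unusable = (ar >> 16) & 1,
                .selector = selector,
        };
        return v;
}

/* Corrected SS check, simplified. */
static bool stack_segment_valid(struct seg ss)
{
        unsigned rpl = ss.selector & 3;

        if (ss.unusable)
                return true;            /* an unusable SS never fails the check */
        if (ss.type != 3 && ss.type != 7)
                return false;           /* must be a writable data segment */
        if (!ss.s || !ss.present)
                return false;
        return ss.dpl == rpl;
}

int main(void)
{
        struct seg ok = decode_ar(0x93, 0x10);          /* type 3, S, present, RPL 0 */
        struct seg unusable = decode_ar(1u << 16, 0);

        printf("normal SS valid:   %d\n", stack_segment_valid(ok));
        printf("unusable SS valid: %d\n", stack_segment_valid(unusable));
        return 0;
}

Zeroing AR for an unusable segment throws away the unusable bit itself, so the checks can no longer tell "unusable, ignore it" apart from "usable but bogus"; that is why the bit has to survive vmx_get_segment() when invalid-state emulation is enabled.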
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index be94410..a6598cb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1649,7 +1649,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = vmcs_read32(sf->limit); var->selector = vmcs_read16(sf->selector); ar = vmcs_read32(sf->ar_bytes); - if (ar & AR_UNUSABLE_MASK) + if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; var->s = (ar >> 4) & 1; -- cgit v1.1 From 10f32d84c750ccf8c0afb3a4ea9d4059aa3e9ffc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 00:53:19 +0200 Subject: KVM: VMX: Prevent exit handler from running if emulating due to invalid state If we've just emulated an instruction, we won't have any valid exit reason and associated information. Fix by moving the clearing of the emulation_required flag to the exit handler. This way the exit handler can notice that we've been emulating and abort early. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6598cb..a309be6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3130,7 +3130,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct vcpu_vmx *vmx = to_vmx(vcpu); int err; preempt_enable(); @@ -3155,11 +3154,6 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); - - /* Guest state should be valid now except if we need to - * emulate an MMIO */ - if (guest_state_valid(vcpu)) - vmx->emulation_required = 0; } /* @@ -3208,8 +3202,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required && emulate_invalid_guest_state) { + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; return 0; + } /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ -- cgit v1.1 From 350f69dcd169d536307aa4a8c38c480e3a51c0db Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 11:12:40 +0200 Subject: KVM: x86 emulator: Make emulate_pop() a little more generic Allow emulate_pop() to read into arbitrary memory rather than just the source operand. Needed for complicated instructions like far returns. 
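Stripped of the emulator plumbing, the new helper is simply "read len bytes at RSP into whatever buffer the caller passes, then advance RSP". A toy userspace version (flat byte-array "guest memory", no segmentation and no ops callbacks, names invented for the sketch) shows why that shape is enough to build a far return out of two pops:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Toy stack: a block of "guest memory" plus a stack pointer into it. */
struct toy_ctxt {
        uint8_t mem[64];
        unsigned long rsp;
};

/* Generic pop: copy 'len' bytes from the stack top into any destination. */
static int emulate_pop(struct toy_ctxt *c, void *dest, int len)
{
        if (c->rsp + len > sizeof(c->mem))
                return -1;              /* would run off the toy stack */
        memcpy(dest, &c->mem[c->rsp], len);
        c->rsp += len;                  /* popping moves RSP up */
        return 0;
}

int main(void)
{
        struct toy_ctxt c = { .rsp = 48 };
        uint32_t eip = 0, cs = 0;

        memcpy(&c.mem[48], &(uint32_t){ 0x1234 }, 4);   /* return EIP */
        memcpy(&c.mem[52], &(uint32_t){ 0x0008 }, 4);   /* return CS  */

        /* A far return is now just two pops into different destinations. */
        emulate_pop(&c, &eip, 4);
        emulate_pop(&c, &cs, 4);
        printf("ret far -> cs=%#x eip=%#x\n", cs, eip);
        return 0;
}

The real emulate_pop() still reads through ops->read_emulated() against the guest's stack segment; only the destination and length became caller-controlled, which is what the far-return patch further down relies on.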
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 54fb098..94459f3 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1136,18 +1136,19 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + void *dest, int len) { struct decode_cache *c = &ctxt->decode; int rc; rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), - &c->src.val, c->src.bytes, ctxt->vcpu); + dest, len, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } @@ -1157,11 +1158,9 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - c->src.bytes = c->dst.bytes; - rc = emulate_pop(ctxt, ops); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); if (rc != 0) return rc; - c->dst.val = c->src.val; return 0; } @@ -1467,11 +1466,9 @@ special_insn: break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - c->src.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ops); + rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); if (rc != 0) goto done; - c->dst.val = c->src.val; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) -- cgit v1.1 From 8b3079a5c0c031de07c8390aa160a4229088274f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 12:10:54 +0200 Subject: KVM: VMX: When emulating on invalid vmx state, don't return to userspace unnecessarily If we aren't doing mmio there's no need to exit to userspace (which will just be confused). 
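The handler's decision reduces to a single predicate: after emulating because guest state was invalid, go back to userspace only when the emulator stopped on MMIO. A minimal sketch using KVM's exit-handler convention of returning 0 for "exit to userspace" and 1 for "re-enter the guest" (the enum constants are written out here only to keep the example standalone):

#include <stdio.h>

enum emulation_result { EMULATE_DONE, EMULATE_DO_MMIO, EMULATE_FAIL };

static int handle_invalid_state_exit(enum emulation_result last_result)
{
        if (last_result == EMULATE_DO_MMIO)
                return 0;       /* userspace must complete the MMIO access */
        return 1;               /* nothing for userspace to do; keep running */
}

int main(void)
{
        printf("after plain emulation: %s\n",
               handle_invalid_state_exit(EMULATE_DONE) ?
               "re-enter guest" : "exit to userspace");
        printf("after MMIO emulation:  %s\n",
               handle_invalid_state_exit(EMULATE_DO_MMIO) ?
               "re-enter guest" : "exit to userspace");
        return 0;
}

Stashing the result in the vcpu structure is what lets the exit handler make that call after the fact, since by then the VMCS exit fields describe nothing useful.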
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a309be6..df454de 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -91,6 +91,7 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; + enum emulation_result invalid_state_emulation_result; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; @@ -3130,7 +3131,8 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - int err; + struct vcpu_vmx *vmx = to_vmx(vcpu); + enum emulation_result err = EMULATE_DONE; preempt_enable(); local_irq_enable(); @@ -3154,6 +3156,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); + + vmx->invalid_state_emulation_result = err; } /* @@ -3205,7 +3209,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vmx->emulation_required && emulate_invalid_guest_state) { if (guest_state_valid(vcpu)) vmx->emulation_required = 0; - return 0; + return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO; } /* Access CR3 don't cause VMExit in paging mode, so we need -- cgit v1.1 From a77ab5ead5c1fef2c6c5a9b3cf3765e52643a2aa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 13:27:34 +0200 Subject: KVM: x86 emulator: implement 'ret far' instruction (opcode 0xcb) Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 94459f3..ca91749 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -178,7 +178,7 @@ static u32 opcode_table[256] = { 0, ImplicitOps | Stack, 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -1278,6 +1278,25 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, return 0; } +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned long cs; + + rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + if (rc) + return rc; + if (c->op_bytes == 4) + c->eip = (u32)c->eip; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + if (rc) + return rc; + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + return rc; +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1735,6 +1754,11 @@ special_insn: mov: c->dst.val = c->src.val; break; + case 0xcb: /* ret far */ + rc = emulate_ret_far(ctxt, ops); + if (rc) + goto done; + break; case 0xd0 ... 0xd1: /* Grp2 */ c->src.val = 1; emulate_grp2(ctxt); -- cgit v1.1 From 269e05e48502f1cc06802e9fba90f5100dd6bb0d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Jan 2009 15:21:42 +0200 Subject: KVM: Properly lock PIT creation Otherwise, two threads can create a PIT in parallel and cause a memory leak. 
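The fix is the familiar check-then-create-under-a-lock pattern: take the VM lock, bail out with -EEXIST if a PIT already exists, and only then allocate. A small pthread sketch of the same idea (a userspace stand-in only; the kernel uses kvm->lock and kvm_create_pit(), and the allocation here is just a malloc):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;
static void *vpit;              /* stand-in for kvm->arch.vpit */
static int creations;

static int ioctl_create_pit(void)
{
        int r;

        pthread_mutex_lock(&vm_lock);
        if (vpit) {
                r = -1;                 /* -EEXIST in the real code */
        } else {
                vpit = malloc(64);      /* stand-in for kvm_create_pit() */
                if (vpit)
                        creations++;
                r = vpit ? 0 : -2;      /* -ENOMEM */
        }
        pthread_mutex_unlock(&vm_lock);
        return r;
}

static void *racer(void *arg)
{
        (void)arg;
        ioctl_create_pit();
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, racer, NULL);
        pthread_create(&b, NULL, racer, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* Without the check under the lock, both threads could allocate. */
        printf("PITs allocated: %d\n", creations);      /* always 1 */
        free(vpit);
        return 0;
}

Note that the companion hunk drops the mutex_lock() that kvm_create_pit() used to take around kvm_request_irq_source_id(): the caller now holds kvm->lock, so taking it again would deadlock.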
Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 -- arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 528daad..69d1bbff 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -548,9 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) if (!pit) return NULL; - mutex_lock(&kvm->lock); pit->irq_source_id = kvm_request_irq_source_id(kvm); - mutex_unlock(&kvm->lock); if (pit->irq_source_id < 0) { kfree(pit); return NULL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1f1461..6fbc346 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1837,10 +1837,16 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; case KVM_CREATE_PIT: + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpit) + goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm); if (kvm->arch.vpit) r = 0; + create_pit_unlock: + mutex_unlock(&kvm->lock); break; case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; -- cgit v1.1 From c8a73f186bf62235b6fb5dd52601d641917dd50b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 5 Jan 2009 16:02:47 +0100 Subject: KVM: SVM: Add microcode patch level dummy VMware ESX checks if the microcode level is correct when using a barcelona CPU, in order to see if it actually can use SVM. Let's tell it we're on the safe side... Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b3a7a31..3a60c3f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1928,6 +1928,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_VM_CR: *data = 0; break; + case MSR_IA32_UCODE_REV: + *data = 0x01000065; + break; default: return kvm_get_msr_common(vcpu, ecx, data); } -- cgit v1.1 From 4677a3b693e035f186e2875259b9a0bb94c42fbe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jan 2009 13:00:27 +0200 Subject: KVM: MMU: Optimize page unshadowing Using kvm_mmu_lookup_page() will result in multiple scans of the hash chains; use hlist_for_each_entry_safe() to achieve a single scan instead. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1585080..aac0499 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1471,11 +1471,20 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { + unsigned index; + struct hlist_head *bucket; struct kvm_mmu_page *sp; + struct hlist_node *node, *nn; - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + index = kvm_page_table_hashfn(gfn); + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { + if (sp->gfn == gfn && !sp->role.metaphysical + && !sp->role.invalid) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } } } -- cgit v1.1 From 5d9b8e30f543a9f21a968a4cda71e8f6d1c66a61 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 18:04:18 +0200 Subject: KVM: Add CONFIG_HAVE_KVM_IRQCHIP Two KVM archs support irqchips and two don't. Add a Kconfig item to make selecting between the two models easier. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b81125f..0a303c3e 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -4,6 +4,10 @@ config HAVE_KVM bool +config HAVE_KVM_IRQCHIP + bool + default y + menuconfig VIRTUALIZATION bool "Virtualization" depends on HAVE_KVM || X86 -- cgit v1.1 From 4780c65904f0fc4e312ee2da9383eacbe04e61ea Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 4 Jan 2009 18:06:06 +0200 Subject: KVM: Reset PIT irq injection logic when the PIT IRQ is unmasked While the PIT is masked the guest cannot ack the irq, so the reinject logic will never allow the interrupt to be injected. Fix by resetting the reinjection counters on unmask. Unbreaks Xen. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 15 +++++++++++++++ arch/x86/kvm/i8254.h | 1 + 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 69d1bbff..c13bb92 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -539,6 +539,16 @@ void kvm_pit_reset(struct kvm_pit *pit) pit->pit_state.irq_ack = 1; } +static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) +{ + struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); + + if (!mask) { + atomic_set(&pit->pit_state.pit_timer.pending, 0); + pit->pit_state.irq_ack = 1; + } +} + struct kvm_pit *kvm_create_pit(struct kvm *kvm) { struct kvm_pit *pit; @@ -586,6 +596,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) kvm_pit_reset(pit); + pit->mask_notifier.func = pit_mask_notifer; + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + return pit; } @@ -594,6 +607,8 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_unregister_irq_mask_notifier(kvm, 0, + &kvm->arch.vpit->mask_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 76959c4..6acbe4b 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -46,6 +46,7 @@ struct kvm_pit { struct kvm *kvm; struct kvm_kpit_state pit_state; int irq_source_id; + struct kvm_irq_mask_notifier mask_notifier; }; #define KVM_PIT_BASE_ADDRESS 0x40 -- cgit v1.1 From ff81ff10b4417952919dcc0bd67effa9ceb411c0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 8 Jan 2009 11:05:17 -0800 Subject: KVM: SVM: Fix typo in has_svm() Signed-off-by: Joe Perches Acked-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3a60c3f..db5021b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -250,7 +250,7 @@ static int has_svm(void) const char *msg; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svn: %s\n", msg); + printk(KERN_INFO "has_svm: %s\n", msg); return 0; } -- cgit v1.1 From 9903a927a4aea4b1ea42356a8453fca664df0b18 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 8 Jan 2009 17:44:19 -0200 Subject: KVM: MMU: drop zeroing on mmu_memory_cache_alloc Zeroing on mmu_memory_cache_alloc is unnecessary since: - Smaller areas are pre-allocated with kmem_cache_zalloc. - Page pointed by ->spt is overwritten with prefetch_page and entries in page pointed by ->gfns are initialized before reading. 
[avi: zeroing pages is unnecessary] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aac0499..de9a9fb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -352,7 +352,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, BUG_ON(!mc->nobjs); p = mc->objects[--mc->nobjs]; - memset(p, 0, size); return p; } -- cgit v1.1 From f6e2c02b6d28ddabe99377c5640a833407a62632 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 11 Jan 2009 13:02:10 +0200 Subject: KVM: MMU: Rename "metaphysical" attribute to "direct" This actually describes what is going on, rather than alerting the reader that something strange is going on. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu.c | 32 ++++++++++++++++---------------- arch/x86/kvm/paging_tmpl.h | 12 ++++++------ 3 files changed, 25 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 863ea73..55fd4c5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -170,7 +170,8 @@ struct kvm_pte_chain { * bits 0:3 - total guest paging levels (2-4, or zero for real mode) * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) + * bit 16 - direct mapping of virtual to physical mapping at gfn + * used for real mode and two-dimensional paging * bits 17:19 - common access permissions for all ptes in this shadow page */ union kvm_mmu_page_role { @@ -180,7 +181,7 @@ union kvm_mmu_page_role { unsigned level:4; unsigned quadrant:2; unsigned pad_for_nice_hex_output:6; - unsigned metaphysical:1; + unsigned direct:1; unsigned access:3; unsigned invalid:1; unsigned cr4_pge:1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index de9a9fb..ef060ec 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1066,7 +1066,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: found role %x\n", __func__, sp->role.word); @@ -1200,7 +1200,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level, - int metaphysical, + int direct, unsigned access, u64 *parent_pte) { @@ -1213,7 +1213,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu.base_role; role.level = level; - role.metaphysical = metaphysical; + role.direct = direct; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); @@ -1250,7 +1250,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->role = role; sp->global = role.cr4_pge; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) { + if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); @@ -1395,7 +1395,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); - if 
(!sp->role.invalid && !sp->role.metaphysical) + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -1458,7 +1458,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { + if (sp->gfn == gfn && !sp->role.direct) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -1478,7 +1478,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.metaphysical + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); @@ -1638,7 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.metaphysical) + if (s->gfn != sp->gfn || s->role.direct) continue; if (s->role.word != sp->role.word) return 1; @@ -1951,7 +1951,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) int i; gfn_t root_gfn; struct kvm_mmu_page *sp; - int metaphysical = 0; + int direct = 0; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1960,18 +1960,18 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (tdp_enabled) - metaphysical = 1; + direct = 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, metaphysical, + PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; return; } - metaphysical = !is_paging(vcpu); + direct = !is_paging(vcpu); if (tdp_enabled) - metaphysical = 1; + direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1985,7 +1985,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, metaphysical, + PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -2487,7 +2487,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) + if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 
4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -3125,7 +3125,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) + if (sp->role.direct) continue; gfn = unalias_gfn(vcpu->kvm, sp->gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 46b68f9..7314c09 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -277,7 +277,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, unsigned access = gw->pt_access; struct kvm_mmu_page *shadow_page; u64 spte, *sptep; - int metaphysical; + int direct; gfn_t table_gfn; int r; int level; @@ -313,17 +313,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; + direct = 1; if (!is_dirty_pte(gw->ptes[level - 1])) access &= ~ACC_WRITE_MASK; table_gfn = gpte_to_gfn(gw->ptes[level - 1]); } else { - metaphysical = 0; + direct = 0; table_gfn = gw->table_gfn[level - 2]; } shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, sptep); - if (!metaphysical) { + direct, access, sptep); + if (!direct) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); @@ -512,7 +512,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, pt_element_t pt[256 / sizeof(pt_element_t)]; gpa_t pte_gpa; - if (sp->role.metaphysical + if (sp->role.direct || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { nonpaging_prefetch_page(vcpu, sp); return; -- cgit v1.1 From 5a41accd3f69c6ef1d58f0c148980bae70515937 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 11 Jan 2009 17:19:35 +0200 Subject: KVM: MMU: Only enable cr4_pge role in shadow mode Two dimensional paging is only confused by it. 
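Spelled out, the change makes the guest's CR4.PGE bit feed the mmu role word only under shadow paging; with TDP the bit is pinned to zero. The role word is part of a shadow page's identity, so letting guest PGE toggles flip it when the hardware walks the guest page tables anyway would only churn roles for no benefit. A tiny sketch of the computation (the role struct is pared down to the single bit in question):

#include <stdio.h>
#include <stdbool.h>

#define X86_CR4_PGE (1ul << 7)

struct mmu_role { unsigned cr4_pge : 1; };

static struct mmu_role compute_role(unsigned long cr4, bool tdp_enabled)
{
        struct mmu_role role = {
                /* Only shadow paging lets guest CR4.PGE influence the role. */
                .cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled,
        };
        return role;
}

int main(void)
{
        /* Shadow mode: toggling guest CR4.PGE changes the role. */
        printf("shadow, PGE set:   cr4_pge=%d\n", compute_role(X86_CR4_PGE, false).cr4_pge);
        printf("shadow, PGE clear: cr4_pge=%d\n", compute_role(0, false).cr4_pge);
        /* TDP mode: the role stays fixed no matter what the guest does to PGE. */
        printf("tdp,    PGE set:   cr4_pge=%d\n", compute_role(X86_CR4_PGE, true).cr4_pge);
        return 0;
}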
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6fbc346..19ccde6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -364,7 +364,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - vcpu->arch.mmu.base_role.cr4_pge = !!(cr4 & X86_CR4_PGE); + vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } -- cgit v1.1 From 193554750441d91e127dd5066b8aebe0f769101c Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 14 Jan 2009 16:56:00 +0000 Subject: KVM: x86: Fix typos and whitespace errors Some typos, comments, whitespace errors corrected in the cpuid code Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19ccde6..141a016 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1067,7 +1067,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; @@ -1165,8 +1165,8 @@ out: } static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1185,8 +1185,8 @@ out: } static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) { int r; @@ -1195,7 +1195,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, goto out; r = -EFAULT; if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) goto out; return 0; @@ -1205,12 +1205,12 @@ out: } static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) + u32 index) { entry->function = function; entry->index = index; cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); entry->flags = 0; } @@ -1249,7 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | bit(X86_FEATURE_SVM); - /* all func 2 cpuid_count() should be called on the same cpu */ + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); ++*nent; @@ -1323,7 +1323,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) + struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG; @@ -1340,7 +1340,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[0].eax; for (func = 1; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ 
-1349,10 +1349,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + &nent, cpuid->nent); r = -EFAULT; if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) + nent * sizeof(struct kvm_cpuid_entry2))) goto out_free; cpuid->nent = nent; r = 0; @@ -1496,7 +1496,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; break; @@ -1509,7 +1509,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); + cpuid_arg->entries); if (r) goto out; r = -EFAULT; @@ -2864,7 +2864,7 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) return 0; if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) return 0; return 1; } @@ -2892,7 +2892,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, if (!best || e->function > best->function) best = e; } - return best; } -- cgit v1.1 From 399ec807ddc38ecccf8c06dbde04531cbdc63e11 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 19 Nov 2008 13:58:46 +0200 Subject: KVM: Userspace controlled irq routing Currently KVM has a static routing from GSI numbers to interrupts (namely, 0-15 are mapped 1:1 to both PIC and IOAPIC, and 16:23 are mapped 1:1 to the IOAPIC). This is insufficient for several reasons: - HPET requires non 1:1 mapping for the timer interrupt - MSIs need a new method to assign interrupt numbers and dispatch them - ACPI APIC mode needs to be able to reassign the PCI LINK interrupts to the ioapics This patch implements an interrupt routing table (as a linked list, but this can be easily changed) and a userspace interface to replace the table. The routing table is initialized according to the current hardwired mapping. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 141a016..32e3a7e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1835,6 +1835,12 @@ long kvm_arch_vm_ioctl(struct file *filp, } } else goto out; + r = kvm_setup_default_irq_routing(kvm); + if (r) { + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + goto out; + } break; case KVM_CREATE_PIT: mutex_lock(&kvm->lock); -- cgit v1.1 From 91b2ae773d3b168b763237fac33f75b13d891f20 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 19 Jan 2009 14:57:52 +0200 Subject: KVM: Avoid using CONFIG_ in userspace visible headers Kconfig symbols are not available in userspace, and are not stripped by headers-install. Avoid their use by adding #defines in to suit each architecture. 
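The underlying problem is purely a preprocessor one: CONFIG_* symbols exist only while the kernel itself is being built, so an exported header that tests them silently loses those branches when userspace includes it. A minimal illustration of the replacement pattern (__KVM_HAVE_GUEST_DEBUG matches the define added below; CONFIG_KVM stands in for any Kconfig symbol userspace never sees):

#include <stdio.h>

/* What the exported arch header does: advertise the capability itself. */
#define __KVM_HAVE_GUEST_DEBUG

int main(void)
{
#ifdef CONFIG_KVM                       /* never defined outside a kernel build */
        printf("unreachable in a userspace build\n");
#endif

#ifdef __KVM_HAVE_GUEST_DEBUG
        printf("guest debug ioctls advertised for this architecture\n");
#else
        printf("guest debug ioctls not advertised\n");
#endif
        return 0;
}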
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 54bcf22..dc3f6cf 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -15,6 +15,7 @@ #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI +#define __KVM_HAVE_GUEST_DEBUG /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -- cgit v1.1 From 44882eed2ebe7f75f8cdae5671ab1d6e0fa40dbc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Jan 2009 15:12:38 -0200 Subject: KVM: make irq ack notifications aware of routing table IRQ ack notifications assume an identity mapping between pin->gsi, which might not be the case with, for example, HPET. Translate before acking. Signed-off-by: Marcelo Tosatti Acked-by: Gleb Natapov --- arch/x86/kvm/i8259.c | 5 +++-- arch/x86/kvm/irq.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 179dcb0..9316037 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,8 @@ static void pic_unlock(struct kvm_pic *s) spin_unlock(&s->lock); while (acks) { - kvm_notify_acked_irq(kvm, __ffs(acks)); + kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), + __ffs(acks)); acks &= acks - 1; } @@ -232,7 +233,7 @@ int kvm_pic_read_irq(struct kvm *kvm) } pic_update_irq(s); pic_unlock(s); - kvm_notify_acked_irq(kvm, irq); + kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); return intno; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 82579ee..9f59318 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -32,6 +32,8 @@ #include "lapic.h" #define PIC_NUM_PINS 16 +#define SELECT_PIC(irq) \ + ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) struct kvm; struct kvm_vcpu; -- cgit v1.1 From d20626936dd6aa783760e780dae5abb127564316 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 2 Feb 2009 16:23:50 +0100 Subject: x86: Add EFER descriptions for FFXSR AMD k10 includes support for the FFXSR feature, which leaves out XMM registers on FXSAVE/FXSAVE when the EFER_FFXSR bit is set in EFER. The CPUID feature bit exists already, but the EFER bit is missing currently, so this patch adds it to the list of known EFER bits. Signed-off-by: Alexander Graf CC: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 46e9646..f4e505f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -19,12 +19,14 @@ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ #define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) #define EFER_SVME (1<<_EFER_SVME) +#define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ #define MSR_IA32_PERFCTR0 0x000000c1 -- cgit v1.1 From 1b2fd70c4eddef53f32639296818c0253e7ca48d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 2 Feb 2009 16:23:51 +0100 Subject: KVM: Add FFXSR support AMD K10 CPUs implement the FFXSR feature that gets enabled using EFER. 
Let's check if the virtual CPU description includes that CPUID feature bit and allow enabling it then. This is required for Windows Server 2008 in Hyper-V mode. v2 adds CPUID capability exposure Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/x86.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index db5021b..1c4a018 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -417,6 +417,9 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32e3a7e..8f83590 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -490,6 +490,17 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } + if (efer & EFER_FFXSR) { + struct kvm_cpuid_entry2 *feat; + + feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { + printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; @@ -1240,6 +1251,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #ifdef CONFIG_X86_64 bit(X86_FEATURE_LM) | #endif + bit(X86_FEATURE_FXSR_OPT) | bit(X86_FEATURE_MMXEXT) | bit(X86_FEATURE_3DNOWEXT) | bit(X86_FEATURE_3DNOW); -- cgit v1.1 From 34c33d163fe509da8414a736c6328855f8c164e5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 8 Feb 2009 13:28:15 +0100 Subject: KVM: Drop unused evaluations from string pio handlers Looks like neither the direction nor the rep prefix are used anymore. Drop related evaluations from SVM's and VMX's I/O exit handlers. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 +--- arch/x86/kvm/vmx.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1c4a018..22e88a4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1194,7 +1194,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? 
*/ - int size, down, in, string, rep; + int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; @@ -1213,8 +1213,6 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - rep = (io_info & SVM_IOIO_REP_MASK) != 0; - down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index df454de..509b353 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2698,7 +2698,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { unsigned long exit_qualification; - int size, down, in, string, rep; + int size, in, string; unsigned port; ++vcpu->stat.io_exits; @@ -2714,8 +2714,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) size = (exit_qualification & 7) + 1; in = (exit_qualification & 8) != 0; - down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; skip_emulated_instruction(vcpu); -- cgit v1.1 From 49cd7d2238e44f7ee4269481cd8a1261cc8f93a5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 11 Feb 2009 13:50:40 +0800 Subject: KVM: VMX: Use kvm_mmu_page_fault() handle EPT violation mmio Removed duplicated code. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 509b353..cb27ffc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3058,11 +3058,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; - enum emulation_result er; gpa_t gpa; - unsigned long hva; int gla_validity; - int r; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); @@ -3085,32 +3082,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT); - if (!kvm_is_error_hva(hva)) { - r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); - if (r < 0) { - printk(KERN_ERR "EPT: Not enough memory!\n"); - return -ENOMEM; - } - return 1; - } else { - /* must be MMIO */ - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - - if (er == EMULATE_FAIL) { - printk(KERN_ERR - "EPT: Fail to handle EPT violation vmexit!er is %d\n", - er); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - return -ENOTSUPP; - } else if (er == EMULATE_DO_MMIO) - return 0; - } - return 1; + return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -- cgit v1.1 From c807660407a695f390034e402edfe544a1d2e40c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 4 Feb 2009 17:52:04 +0100 Subject: KVM: Fix kvmclock on !constant_tsc boxes kvmclock currently falls apart on machines without constant tsc. This patch fixes it. 
Changes: * keep tsc frequency in a per-cpu variable. * handle kvmclock update using a new request flag, thus checking whenever we need an update each time we enter guest context. * use a cpufreq notifier to track frequency changes and force kvmclock updates. * send ipis to kick cpu out of guest context if needed to make sure the guest doesn't see stale values. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f83590..05d7be8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -617,6 +618,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul); } +static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -627,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; - if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { - kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = tsc_khz; + if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { + kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); } /* Keep irq disabled to prevent changes to the clock */ @@ -660,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static int kvm_request_guest_time_update(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + + if (!vcpu->time_page) + return 0; + set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + return 1; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -790,7 +803,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = NULL; } - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); break; } default: @@ -1000,6 +1013,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: @@ -1025,9 +1039,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; - case KVM_CAP_CLOCKSOURCE: - r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); - break; default: r = 0; break; @@ -1098,7 +1109,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_write_guest_time(vcpu); + kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2642,9 +2653,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +static void bounce_off(void *info) +{ + /* nothing */ +} + +static unsigned int ref_freq; +static unsigned long tsc_khz_ref; + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, send_ipi = 0; + + if (!ref_freq) + ref_freq = freq->old; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + per_cpu(cpu_tsc_khz, freq->cpu) = 
cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->cpu != freq->cpu) + continue; + if (!kvm_request_guest_time_update(vcpu)) + continue; + if (vcpu->cpu != smp_processor_id()) + send_ipi++; + } + } + spin_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* + * We upscale the frequency. Must make the guest + * doesn't see old kvmclock values while running with + * the new frequency, otherwise we risk the guest sees + * time go backwards. + * + * In case we update the frequency for another cpu + * (which might be in guest context) send an interrupt + * to kick the cpu out of guest context. Next time + * guest context is entered kvmclock will be updated, + * so the guest will not see stale values. + */ + smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + } + return 0; +} + +static struct notifier_block kvmclock_cpufreq_notifier_block = { + .notifier_call = kvmclock_cpufreq_notifier +}; + int kvm_arch_init(void *opaque) { - int r; + int r, cpu; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -2675,6 +2749,15 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); + + for_each_possible_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + tsc_khz_ref = tsc_khz; + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + return 0; out: @@ -3010,6 +3093,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + kvm_write_guest_time(vcpu); if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) -- cgit v1.1 From 6bed6b9e84bf5e6b468847a50c0751389fdd01d4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Feb 2009 14:08:59 +0100 Subject: KVM: MMU: remove redundant check in mmu_set_spte The following code flow is unnecessary: if (largepage) was_rmapped = is_large_pte(*shadow_pte); else was_rmapped = 1; The is_large_pte() function will always evaluate to one here because the (largepage && !is_large_pte) case is already handled in the first if-clause. So we can remove this check and set was_rmapped to one always here. 
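The claim in the changelog can be checked mechanically: once the first clause has filtered out the (largepage && !is_large_pte) combination, the conditional being removed can only ever produce 1. A toy model of just the was_rmapped computation (the surrounding rmap and child-page handling is reduced to early returns) compares the old and new flow over every input:

#include <stdio.h>
#include <stdbool.h>

static int was_rmapped_old(bool largepage, bool pte_is_large, bool pfn_changed)
{
        if (largepage && !pte_is_large)
                return 0;       /* first if-clause: demote path, stays 0 */
        else if (pfn_changed)
                return 0;       /* pfn changed: old mapping dropped, stays 0 */
        else
                return largepage ? pte_is_large : 1;    /* the removed branch */
}

static int was_rmapped_new(bool largepage, bool pte_is_large, bool pfn_changed)
{
        if (largepage && !pte_is_large)
                return 0;
        else if (pfn_changed)
                return 0;
        else
                return 1;       /* always 1 here, as the changelog argues */
}

int main(void)
{
        int m, mismatches = 0;

        for (m = 0; m < 8; m++) {
                bool lp = m & 1, large = m & 2, pfn = m & 4;

                if (was_rmapped_old(lp, large, pfn) !=
                    was_rmapped_new(lp, large, pfn))
                        mismatches++;
        }
        printf("cases where old and new disagree: %d\n", mismatches);   /* 0 */
        return 0;
}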
Signed-off-by: Joerg Roedel Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ef060ec..c90b4b2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1791,12 +1791,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*shadow_pte), pfn); rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } + } else + was_rmapped = 1; } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, dirty, largepage, global, gfn, pfn, speculative, true)) { -- cgit v1.1 From 452425dbaa1974e9fc489e64a8de46a47b4c2754 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Feb 2009 14:54:37 +0100 Subject: KVM: MMU: remove assertion in kvm_mmu_alloc_page The assertion no longer makes sense since we don't clear page tables on allocation; instead we clear them during prefetch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c90b4b2..4a21b0f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -802,7 +802,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&sp->oos_link); - ASSERT(is_empty_shadow_page(sp->spt)); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; -- cgit v1.1 From 4925663a079c77d95d8685228ad6675fc5639c8e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 Feb 2009 17:28:14 +0200 Subject: KVM: Report IRQ injection status to userspace. IRQ injection status is either -1 (if there was no CPU found that should accept the interrupt because IRQ was masked or ioapic was misconfigured or ...) or >= 0, in which case the number indicates to how many CPUs the interrupt was injected. If the value is 0 it means that the interrupt was coalesced and probably should be reinjected. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/i8259.c | 18 +++++++++++++----- arch/x86/kvm/x86.c | 13 +++++++++++-- 3 files changed, 25 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55fd4c5..f0faf58 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -616,7 +616,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); -void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 9316037..b4e662e 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -77,12 +77,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm) /* * set irq level.
If an edge is detected, then the IRR is set to 1 */ -static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) { - int mask; + int mask, ret = 1; mask = 1 << irq; if (s->elcr & mask) /* level triggered */ if (level) { + ret = !(s->irr & mask); s->irr |= mask; s->last_irr |= mask; } else { @@ -91,11 +92,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) } else /* edge triggered */ if (level) { - if ((s->last_irr & mask) == 0) + if ((s->last_irr & mask) == 0) { + ret = !(s->irr & mask); s->irr |= mask; + } s->last_irr |= mask; } else s->last_irr &= ~mask; + + return (s->imr & mask) ? -1 : ret; } /* @@ -172,16 +177,19 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -void kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } pic_unlock(s); + + return ret; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05d7be8..e4db5be 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1019,6 +1019,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_REINJECT_CONTROL: + case KVM_CAP_IRQ_INJECT_STATUS: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1877,6 +1878,7 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->lock); break; + case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -1884,10 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; if (irqchip_in_kernel(kvm)) { + __s32 status; mutex_lock(&kvm->lock); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); + status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); + if (ioctl == KVM_IRQ_LINE_STATUS) { + irq_event.status = status; + if (copy_to_user(argp, &irq_event, + sizeof irq_event)) + goto out; + } r = 0; } break; -- cgit v1.1 From 1fbdc7a58512a6283e10fd27108197679db95ffa Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Sun, 11 Jan 2009 22:39:44 +0100 Subject: KVM: SVM: set accessed bit for VMCB segment selectors In the segment descriptor _cache_ the accessed bit is always set (although it can be cleared in the descriptor itself). Since Intel checks for this condition on a VMENTRY, set this bit in the AMD path to enable cross vendor migration. Cc: stable@kernel.org Signed-off-by: Andre Przywara Acked-By: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 22e88a4..1821c20 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -796,20 +796,37 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. 
- */ - if (seg == VCPU_SREG_CS) + switch (seg) { + case VCPU_SREG_CS: + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ var->g = s->limit > 0xfffff; - - /* - * Work around a bug where the busy flag in the tr selector - * isn't exposed - */ - if (seg == VCPU_SREG_TR) + break; + case VCPU_SREG_TR: + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ var->type |= 0x2; + break; + case VCPU_SREG_DS: + case VCPU_SREG_ES: + case VCPU_SREG_FS: + case VCPU_SREG_GS: + /* + * The accessed bit must always be set in the segment + * descriptor cache, although it can be cleared in the + * descriptor, the cached bit always remains at 1. Since + * Intel has a check on this, set it here to support + * cross-vendor migration. + */ + if (!var->unusable) + var->type |= 0x1; + break; + } var->unusable = !var->present; } -- cgit v1.1 From c5bc22424021cabda862727fb3f5098b866f074d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 19 Feb 2009 12:18:56 +0100 Subject: KVM: MMU: Fix another largepage memory leak In the paging_fetch function rmap_remove is called after setting a large pte to non-present. This causes rmap_remove to not drop the reference to the large page. The result is a memory leak of that page. Cc: stable@kernel.org Signed-off-by: Joerg Roedel Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7314c09..0f11792 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -306,9 +306,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, continue; if (is_large_pte(*sptep)) { + rmap_remove(vcpu->kvm, sptep); set_shadow_pte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); - rmap_remove(vcpu->kvm, sptep); } if (level == PT_DIRECTORY_LEVEL -- cgit v1.1 From 401d10dee083bda281f2fdcdf654080313ba30ec Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 20 Feb 2009 22:53:37 +0530 Subject: KVM: VMX: Update necessary state when guest enters long mode setup_msrs() should be called when entering long mode to save the shadow state for the 64-bit guest state. Using vmx_set_efer() in enter_lmode() removes some duplicated code and also ensures we call setup_msrs(). We can safely pass the value of shadow_efer to vmx_set_efer() as no other bits in the efer change while enabling long mode (guest first sets EFER.LME, then sets CR0.PG which causes a vmexit where we activate long mode). With this fix, is_long_mode() can check for EFER.LMA set instead of EFER.LME and 5e23049e86dd298b72e206b420513dbc3a240cd9 can be reverted. 
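For context only, a rough sketch (not part of this patch) of the guest-side sequence that parenthesis describes. guest_enter_long_mode() is a hypothetical illustration for a 32-bit guest; CR4.PAE and a valid page-table root in CR3 are assumed to be set up already, and a real guest would read-modify-write EFER rather than overwrite it:

/* Architectural constants: EFER MSR, EFER.LME (bit 8), CR0.PG (bit 31). */
#define MSR_EFER	0xc0000080
#define EFER_LME	(1u << 8)
#define X86_CR0_PG	(1ul << 31)

static inline void wrmsr32(unsigned int msr, unsigned int lo, unsigned int hi)
{
	asm volatile("wrmsr" : : "c"(msr), "a"(lo), "d"(hi));
}

static void guest_enter_long_mode(unsigned long cr0)
{
	/* Step 1: the guest sets EFER.LME (long mode enabled, not active). */
	wrmsr32(MSR_EFER, EFER_LME, 0);
	/* Step 2: the CR0.PG write is trapped by KVM; that is the exit in
	 * which enter_lmode() now calls vmx_set_efer(), which in turn runs
	 * setup_msrs() and makes long mode active (EFER.LMA).            */
	asm volatile("mov %0, %%cr0" : : "r"(cr0 | X86_CR0_PG) : "memory");
}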
Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 54 ++++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb27ffc..48063a0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1430,6 +1430,29 @@ continue_rmode: init_rmode(vcpu->kvm); } +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + vcpu->arch.shadow_efer = efer; + if (!msr) + return; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + #ifdef CONFIG_X86_64 static void enter_lmode(struct kvm_vcpu *vcpu) @@ -1444,13 +1467,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - - find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - | VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.shadow_efer); } static void exit_lmode(struct kvm_vcpu *vcpu) @@ -1609,30 +1627,6 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); } -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - vcpu->arch.shadow_efer = efer; - if (!msr) - return; - if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); - msr->data = efer; - - } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; -- cgit v1.1 From 41d6af119206e98764b4ae6d264d63acefcf851e Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Thu, 28 Feb 2008 16:06:15 +0530 Subject: KVM: is_long_mode() should check for EFER.LMA is_long_mode currently checks the LongModeEnable bit in EFER instead of the LongModeActive bit. This is wrong, but we survived this till now since it wasn't triggered. This breaks guests that go from long mode to compatibility mode. This is noticed on a solaris guest and fixes bug #1842160 Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 258e5d5..eaab214 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -54,7 +54,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LME; + return vcpu->arch.shadow_efer & EFER_LMA; #else return 0; #endif -- cgit v1.1 From d7364a29b305862cbc92b3f8b6ad9968270d068c Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat, 21 Feb 2009 02:18:13 +0100 Subject: KVM: fix sparse warnings: context imbalance Impact: Attribute function with __acquires(...) resp. __releases(...). 
Fix these sparse warnings: arch/x86/kvm/i8259.c:34:13: warning: context imbalance in 'pic_lock' - wrong count at exit arch/x86/kvm/i8259.c:39:13: warning: context imbalance in 'pic_unlock' - unexpected unlock Signed-off-by: Hannes Eder Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b4e662e..1ccb50c 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,11 +32,13 @@ #include static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) { spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) { struct kvm *kvm = s->kvm; unsigned acks = s->pending_acks; -- cgit v1.1 From cded19f396bc3c9d299331709d0a9fbc6ecc148a Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sat, 21 Feb 2009 02:19:13 +0100 Subject: KVM: fix sparse warnings: Should it be static? Impact: Make symbols static. Fix these sparse warnings: arch/x86/kvm/mmu.c:992:5: warning: symbol 'mmu_pages_add' was not declared. Should it be static? arch/x86/kvm/mmu.c:1124:5: warning: symbol 'mmu_pages_next' was not declared. Should it be static? arch/x86/kvm/mmu.c:1144:6: warning: symbol 'mmu_pages_clear_parents' was not declared. Should it be static? arch/x86/kvm/x86.c:2037:5: warning: symbol 'kvm_read_guest_virt' was not declared. Should it be static? arch/x86/kvm/x86.c:2067:5: warning: symbol 'kvm_write_guest_virt' was not declared. Should it be static? virt/kvm/irq_comm.c:220:5: warning: symbol 'setup_routing_entry' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 ++++++----- arch/x86/kvm/x86.c | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a21b0f..2a36f7f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -989,8 +989,8 @@ struct kvm_mmu_pages { idx < 512; \ idx = find_next_bit(bitmap, 512, idx+1)) -int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, - int idx) +static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) { int i; @@ -1121,8 +1121,9 @@ struct mmu_page_path { i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) -int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, - int i) +static int mmu_pages_next(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents, + int i) { int n; @@ -1141,7 +1142,7 @@ int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, return n; } -void mmu_pages_clear_parents(struct mmu_page_path *parents) +static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4db5be..8ca100a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2043,8 +2043,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return dev; } -int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void *data = val; int r = X86EMUL_CONTINUE; @@ -2073,8 +2073,8 @@ out: return r; } -int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu) { void
*data = val; int r = X86EMUL_CONTINUE; -- cgit v1.1 From 4539b35881ae9664b0e2953438dd83f5ee02c0b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 12 Mar 2009 18:18:43 +0100 Subject: KVM: Fix missing smp tlb flush in invlpg When kvm emulates an invlpg instruction, it can drop a shadow pte but leave the guest tlbs intact. This can cause memory corruption when swapping out: without the flush, another cpu can still write to a freed host physical page. Whenever rmap_remove is called, the smp tlb flush must happen before mmu_lock is released, because the VM will take mmu_lock before it can finally add the page to the freelist after swapout. The mmu notifier makes it safe to flush the tlb after freeing the page (otherwise it would never be safe), so a single flush can cover multiple invalidated sptes. Cc: stable@kernel.org Signed-off-by: Andrea Arcangeli Acked-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0f11792..6bd7020 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -446,6 +446,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) gpa_t pte_gpa = -1; int level; u64 *sptep; + int need_flush = 0; spin_lock(&vcpu->kvm->mmu_lock); @@ -465,6 +466,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; + need_flush = 1; } set_shadow_pte(sptep, shadow_trap_nonpresent_pte); break; @@ -474,6 +476,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) break; } + if (need_flush) + kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); if (pte_gpa == -1) -- cgit v1.1 From 16175a796d061833aacfbd9672235f2d2725df65 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 23 Mar 2009 22:13:44 +0200 Subject: KVM: VMX: Don't allow uninhibited access to EFER on i386 vmx_set_msr() does not allow i386 guests to touch EFER, but they can still do so through the default: label in the switch. If they set EFER_LME, they can oops the host. Fix by having EFER accesses go through the normal channel (which will check for EFER_LME) even on i386. Reported-and-tested-by: Benjamin Gilbert Cc: stable@kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 48063a0..bb48133 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -936,11 +936,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) int ret = 0; switch (msr_index) { -#ifdef CONFIG_X86_64 case MSR_EFER: vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); break; +#ifdef CONFIG_X86_64 case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); break; -- cgit v1.1
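As a usage note on the "Report IRQ injection status to userspace" patch earlier in this series, a minimal userspace sketch of how KVM_IRQ_LINE_STATUS might be consumed. It is illustration only, not part of any patch here: set_irq_line() is a hypothetical helper, vm_fd is assumed to be a VM file descriptor from KVM_CREATE_VM, and the kvm_irq_level status field is assumed to be the uapi change that commit makes outside arch/x86. A returned status of -1 means no CPU could take the interrupt, 0 means it was coalesced (the caller may want to reinject), and a positive value is the number of CPUs it was delivered to.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Raise or lower a GSI and report the injection status to the caller. */
static int set_irq_line(int vm_fd, unsigned int gsi, int level)
{
	struct kvm_irq_level event;

	event.irq = gsi;
	event.level = level;
	if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &event) < 0)
		return -1;		/* the ioctl itself failed */
	return event.status;		/* -1, 0 (coalesced) or number of CPUs hit */
}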