summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/ia32/ia32entry.S2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h3
-rw-r--r--arch/x86/include/asm/kvm.h17
-rw-r--r--arch/x86/include/asm/kvm_emulate.h46
-rw-r--r--arch/x86/include/asm/kvm_host.h70
-rw-r--r--arch/x86/include/asm/lguest_hcall.h29
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/kernel/amd_iommu.c20
-rw-r--r--arch/x86/kernel/amd_iommu_init.c48
-rw-r--r--arch/x86/kernel/aperture_64.c15
-rw-r--r--arch/x86/kernel/crash.c6
-rw-r--r--arch/x86/kernel/dumpstack.h8
-rw-r--r--arch/x86/kernel/pci-gart_64.c3
-rw-r--r--arch/x86/kvm/emulate.c1247
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_timer.h4
-rw-r--r--arch/x86/kvm/mmu.c173
-rw-r--r--arch/x86/kvm/mmutrace.h12
-rw-r--r--arch/x86/kvm/paging_tmpl.h27
-rw-r--r--arch/x86/kvm/svm.c916
-rw-r--r--arch/x86/kvm/timer.c3
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c279
-rw-r--r--arch/x86/kvm/x86.c1501
-rw-r--r--arch/x86/lguest/boot.c61
-rw-r--r--arch/x86/lguest/i386_head.S2
27 files changed, 2731 insertions, 1989 deletions
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 59b4556..e790bc1 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -626,7 +626,7 @@ ia32_sys_call_table:
.quad stub32_sigreturn
.quad stub32_clone /* 120 */
.quad sys_setdomainname
- .quad sys_uname
+ .quad sys_newuname
.quad sys_modify_ldt
.quad compat_sys_adjtimex
.quad sys32_mprotect /* 125 */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ba19ad4..86a0ff0 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -21,6 +21,7 @@
#define _ASM_X86_AMD_IOMMU_TYPES_H
#include <linux/types.h>
+#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/spinlock.h>
@@ -140,6 +141,7 @@
/* constants to configure the command buffer */
#define CMD_BUFFER_SIZE 8192
+#define CMD_BUFFER_UNINITIALIZED 1
#define CMD_BUFFER_ENTRIES 512
#define MMIO_CMD_SIZE_SHIFT 56
#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
@@ -237,6 +239,7 @@ struct protection_domain {
struct list_head list; /* for list of all protection domains */
struct list_head dev_list; /* List of all devices in this domain */
spinlock_t lock; /* mostly used to lock the page table*/
+ struct mutex api_lock; /* protect page tables in the iommu-api path */
u16 id; /* the domain id written to the device table */
int mode; /* paging mode (0-6 levels) */
u64 *pt_root; /* page table root pointer */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f..ff90055 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
#define __KVM_HAVE_PIT_STATE2
#define __KVM_HAVE_XEN_HVM
#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS 0x01
+#define KVM_X86_SHADOW_INT_STI 0x02
/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
__u8 injected;
__u8 nr;
__u8 soft;
- __u8 pad;
+ __u8 shadow;
} interrupt;
struct {
__u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
__u32 reserved[10];
};
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54f..0b2729b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
#ifndef _ASM_X86_KVM_X86_EMULATE_H
#define _ASM_X86_KVM_X86_EMULATE_H
+#include <asm/desc_defs.h>
+
struct x86_emulate_ctxt;
/*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
/*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_std)(unsigned long addr, void *val,
+ unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ /*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
* @addr: [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
unsigned int bytes,
struct kvm_vcpu *vcpu);
+ int (*pio_in_emulated)(int size, unsigned short port, void *val,
+ unsigned int count, struct kvm_vcpu *vcpu);
+
+ int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+ unsigned int count, struct kvm_vcpu *vcpu);
+
+ bool (*get_cached_descriptor)(struct desc_struct *desc,
+ int seg, struct kvm_vcpu *vcpu);
+ void (*set_cached_descriptor)(struct desc_struct *desc,
+ int seg, struct kvm_vcpu *vcpu);
+ u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+ void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+ void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+ ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+ void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+ int (*cpl)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
};
/* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
unsigned long end;
};
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
struct decode_cache {
u8 twobyte;
u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
u8 seg_override;
unsigned int d;
unsigned long regs[NR_VCPU_REGS];
- unsigned long eip, eip_orig;
+ unsigned long eip;
/* modrm */
u8 modrm;
u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
void *modrm_ptr;
unsigned long modrm_val;
struct fetch_cache fetch;
+ struct read_cache io_read;
};
-#define X86_SHADOW_INT_MOV_SS 1
-#define X86_SHADOW_INT_STI 2
-
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+ bool restart; /* restart string instruction after writeback */
/* decode cache */
struct decode_cache decode;
};
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79..3c31c5a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,8 +171,8 @@ struct kvm_pte_chain {
union kvm_mmu_page_role {
unsigned word;
struct {
- unsigned glevels:4;
unsigned level:4;
+ unsigned cr4_pae:1;
unsigned quadrant:2;
unsigned pad_for_nice_hex_output:6;
unsigned direct:1;
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
- struct list_head oos_link;
-
/*
* The following two entries are used to key the shadow page in the
* hash table.
@@ -204,9 +202,9 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- int multimapped; /* More than one parent_pte? */
- int root_count; /* Currently serving as active root */
+ bool multimapped; /* More than one parent_pte? */
bool unsync;
+ int root_count; /* Currently serving as active root */
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
@@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer {
struct kvm_pio_request {
unsigned long count;
- int cur_count;
- gva_t guest_gva;
int in;
int port;
int size;
- int string;
- int down;
- int rep;
};
/*
@@ -362,8 +355,8 @@ struct kvm_vcpu_arch {
u64 *mce_banks;
/* used for guest single stepping over the given code position */
- u16 singlestep_cs;
unsigned long singlestep_rip;
+
/* fields used by HYPER-V emulation */
u64 hv_vapic;
};
@@ -389,6 +382,7 @@ struct kvm_arch {
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_alloc_mmu_pages;
+ atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
@@ -461,11 +455,6 @@ struct kvm_vcpu_stat {
u32 nmi_injections;
};
-struct descriptor_table {
- u16 limit;
- unsigned long base;
-} __attribute__((packed));
-
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -503,12 +492,11 @@ struct kvm_x86_ops {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
- void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
- int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
- int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -587,23 +575,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags);
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
- unsigned long *rflags);
void kvm_enable_efer_bits(u64);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
struct x86_emulate_ctxt;
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
- int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port);
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +595,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code);
void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -649,8 +631,6 @@ int emulator_write_emulated(unsigned long addr,
unsigned int bytes,
struct kvm_vcpu *vcpu);
-unsigned long segment_base(u16 selector);
-
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
@@ -675,7 +655,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
void kvm_disable_tdp(void);
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
bool kvm_check_iopl(struct kvm_vcpu *vcpu);
@@ -724,23 +703,6 @@ static inline void kvm_load_ldt(u16 sel)
asm("lldt %0" : : "rm"(sel));
}
-static inline void kvm_get_idt(struct descriptor_table *table)
-{
- asm("sidt %0" : "=m"(*table));
-}
-
-static inline void kvm_get_gdt(struct descriptor_table *table)
-{
- asm("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
- u16 tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
-}
-
#ifdef CONFIG_X86_64
static inline unsigned long read_msr(unsigned long msr)
{
@@ -826,4 +788,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_define_shared_msr(unsigned index, u32 msr);
void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index ba0eed8..b60f292 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -28,22 +28,39 @@
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
-#include <asm/kvm_para.h>
/*G:030
* But first, how does our Guest contact the Host to ask for privileged
* operations? There are two ways: the direct way is to make a "hypercall",
* to make requests of the Host Itself.
*
- * We use the KVM hypercall mechanism, though completely different hypercall
- * numbers. Seventeen hypercalls are available: the hypercall number is put in
- * the %eax register, and the arguments (when required) are placed in %ebx,
- * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
+ * Our hypercall mechanism uses the highest unused trap code (traps 32 and
+ * above are used by real hardware interrupts). Seventeen hypercalls are
+ * available: the hypercall number is put in the %eax register, and the
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
*
* Grossly invalid calls result in Sudden Death at the hands of the vengeful
* Host, rather than returning failure. This reflects Winston Churchill's
* definition of a gentleman: "someone who is only rude intentionally".
-:*/
+ */
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3,
+ unsigned long arg4)
+{
+ /* "int" is the Intel instruction to trigger a trap. */
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ /* The call in %eax (aka "a") might be overwritten */
+ : "=a"(call)
+ /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
+ : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
+ /* "memory" means this might write somewhere in memory.
+ * This isn't true for all calls, but it's safe to tell
+ * gcc that it might happen so it doesn't get clever. */
+ : "memory");
+ return call;
+}
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd..0e83105 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
- u8 reserved_5[832];
+ u64 reserved_5;
+ u64 next_rip;
+ u8 reserved_6[816];
};
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
struct __attribute__ ((__packed__)) vmcb_seg {
u16 selector;
u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f3dadb5..f854d89b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -118,7 +118,7 @@ static bool check_device(struct device *dev)
return false;
/* No device or no PCI device */
- if (!dev || dev->bus != &pci_bus_type)
+ if (dev->bus != &pci_bus_type)
return false;
devid = get_device_id(dev);
@@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
u32 tail, head;
u8 *target;
+ WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
target = iommu->cmd_buf + tail;
memcpy_toio(target, cmd, sizeof(*cmd));
@@ -2186,7 +2187,7 @@ static void prealloc_protection_domains(void)
struct dma_ops_domain *dma_dom;
u16 devid;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ for_each_pci_dev(dev) {
/* Do we handle this device? */
if (!check_device(&dev->dev))
@@ -2298,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain)
list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
struct device *dev = dev_data->dev;
- do_detach(dev);
+ __detach_device(dev);
atomic_set(&dev_data->bind, 0);
}
@@ -2327,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void)
return NULL;
spin_lock_init(&domain->lock);
+ mutex_init(&domain->api_lock);
domain->id = domain_id_alloc();
if (!domain->id)
goto out_err;
@@ -2379,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
free_pagetable(domain);
- domain_id_free(domain->id);
-
- kfree(domain);
+ protection_domain_free(domain);
dom->priv = NULL;
}
@@ -2456,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
iova &= PAGE_MASK;
paddr &= PAGE_MASK;
+ mutex_lock(&domain->api_lock);
+
for (i = 0; i < npages; ++i) {
ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
if (ret)
@@ -2465,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
paddr += PAGE_SIZE;
}
+ mutex_unlock(&domain->api_lock);
+
return 0;
}
@@ -2477,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
iova &= PAGE_MASK;
+ mutex_lock(&domain->api_lock);
+
for (i = 0; i < npages; ++i) {
iommu_unmap_page(domain, iova, PM_MAP_4k);
iova += PAGE_SIZE;
}
iommu_flush_tlb_pde(domain);
+
+ mutex_unlock(&domain->api_lock);
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 42f5350..6360abf 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -138,9 +138,9 @@ int amd_iommus_present;
bool amd_iommu_np_cache __read_mostly;
/*
- * Set to true if ACPI table parsing and hardware intialization went properly
+ * The ACPI table parsing functions set this variable on an error
*/
-static bool amd_iommu_initialized;
+static int __initdata amd_iommu_init_err;
/*
* List of protection domains - used during resume
@@ -391,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
*/
for (i = 0; i < table->length; ++i)
checksum += p[i];
- if (checksum != 0)
+ if (checksum != 0) {
/* ACPI table corrupt */
- return -ENODEV;
+ amd_iommu_init_err = -ENODEV;
+ return 0;
+ }
p += IVRS_HEADER_LENGTH;
@@ -436,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
if (cmd_buf == NULL)
return NULL;
- iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+ iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
return cmd_buf;
}
@@ -472,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
&entry, sizeof(entry));
amd_iommu_reset_cmd_buffer(iommu);
+ iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
}
static void __init free_command_buffer(struct amd_iommu *iommu)
{
free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size));
+ get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
}
/* allocates the memory where the IOMMU will log its events to */
@@ -920,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
h->mmio_phys);
iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL)
- return -ENOMEM;
+ if (iommu == NULL) {
+ amd_iommu_init_err = -ENOMEM;
+ return 0;
+ }
+
ret = init_iommu_one(iommu, h);
- if (ret)
- return ret;
+ if (ret) {
+ amd_iommu_init_err = ret;
+ return 0;
+ }
break;
default:
break;
@@ -934,8 +942,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
}
WARN_ON(p != end);
- amd_iommu_initialized = true;
-
return 0;
}
@@ -1211,6 +1217,10 @@ static int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
return -ENODEV;
+ ret = amd_iommu_init_err;
+ if (ret)
+ goto out;
+
dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1270,12 +1280,19 @@ static int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", init_iommu_all) != 0)
goto free;
- if (!amd_iommu_initialized)
+ if (amd_iommu_init_err) {
+ ret = amd_iommu_init_err;
goto free;
+ }
if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
goto free;
+ if (amd_iommu_init_err) {
+ ret = amd_iommu_init_err;
+ goto free;
+ }
+
ret = sysdev_class_register(&amd_iommu_sysdev_class);
if (ret)
goto free;
@@ -1288,6 +1305,8 @@ static int __init amd_iommu_init(void)
if (ret)
goto free;
+ enable_iommus();
+
if (iommu_pass_through)
ret = amd_iommu_init_passthrough();
else
@@ -1300,8 +1319,6 @@ static int __init amd_iommu_init(void)
amd_iommu_init_notifier();
- enable_iommus();
-
if (iommu_pass_through)
goto out;
@@ -1315,6 +1332,7 @@ out:
return ret;
free:
+ disable_iommus();
amd_iommu_uninit_devices();
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3704997..b5d8b0b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void)
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
int bus;
int dev_base, dev_limit;
+ u32 ctl;
bus = bus_dev_ranges[i].bus;
dev_base = bus_dev_ranges[i].dev_base;
@@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void)
gart_iommu_aperture = 1;
x86_init.iommu.iommu_init = gart_iommu_init;
- aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+
+ /*
+ * Before we do anything else disable the GART. It may
+ * still be enabled if we boot into a crash-kernel here.
+ * Reconfiguring the GART while it is enabled could have
+ * unknown side-effects.
+ */
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+ aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a4849c1..ebd4c51 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,7 +27,6 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
-#include <asm/x86_init.h>
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
@@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
-
-#ifdef CONFIG_X86_64
- x86_platform.iommu_shutdown();
-#endif
-
crash_save_cpu(regs, safe_smp_processor_id());
}
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index e39e771..e1a93be 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
+#include <linux/uaccess.h>
+
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -42,8 +44,10 @@ static inline unsigned long rewind_frame_pointer(int n)
get_bp(frame);
#ifdef CONFIG_FRAME_POINTER
- while (n--)
- frame = frame->next_frame;
+ while (n--) {
+ if (probe_kernel_address(&frame->next_frame, frame))
+ break;
+ }
#endif
return (unsigned long)frame;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 68cd24f..0f7f130 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -565,6 +565,9 @@ static void enable_gart_translations(void)
enable_gart_translation(dev, __pa(agp_gatt_table));
}
+
+ /* Flush the GART-TLB to remove stale entries */
+ k8_flush_garts();
}
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6a..5ac0bb4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
#include <asm/kvm_emulate.h>
#include "x86.h"
+#include "tss.h"
/*
* Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
#define DstReg (2<<1) /* Register operand. */
#define DstMem (3<<1) /* Memory operand. */
#define DstAcc (4<<1) /* Destination Accumulator */
+#define DstDI (5<<1) /* Destination is in ES:(E)DI */
+#define DstMem64 (6<<1) /* 64bit memory operand */
#define DstMask (7<<1)
/* Source operand type. */
#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
#define SrcOne (7<<4) /* Implied '1' */
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
#define SrcImmU (9<<4) /* Immediate operand, unsigned */
+#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -85,6 +89,9 @@
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
#define Src2Imm16 (4<<29)
+#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
+ in memory and second argument is located
+ immediately after the first one in memory. */
#define Src2Mask (7<<29)
enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
0, 0, 0, 0,
/* 0x68 - 0x6F */
SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
+ DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
+ SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
/* 0x70 - 0x77 */
SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
/* 0xA0 - 0xA7 */
ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
- ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | String, ImplicitOps | String,
+ ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
+ ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
/* 0xA8 - 0xAF */
- 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
- ByteOp | ImplicitOps | String, ImplicitOps | String,
+ 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+ ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
+ ByteOp | DstDI | String, DstDI | String,
/* 0xB0 - 0xB7 */
ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE0 - 0xE7 */
0, 0, 0, 0,
- ByteOp | SrcImmUByte, SrcImmUByte,
- ByteOp | SrcImmUByte, SrcImmUByte,
+ ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
+ ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
- SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+ SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
+ SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
[Group5*8] =
DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+ SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+ SrcMem | ModRM | Stack, 0,
[Group7*8] =
0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
[Group9*8] =
- 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+ 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
};
static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+ SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov, 0,
+ SrcMem16 | ModRM | Mov | Priv, 0,
[Group9*8] =
0, 0, 0, 0, 0, 0, 0, 0,
};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
#define insn_fetch(_type, _size, _eip) \
({ unsigned long _x; \
rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != 0) \
+ if (rc != X86EMUL_CONTINUE) \
goto done; \
(_eip) += (_size); \
(_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- unsigned long linear, u8 *dest)
+ unsigned long eip, u8 *dest)
{
struct fetch_cache *fc = &ctxt->decode.fetch;
int rc;
- int size;
+ int size, cur_size;
- if (linear < fc->start || linear >= fc->end) {
- size = min(15UL, PAGE_SIZE - offset_in_page(linear));
- rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
- if (rc)
+ if (eip == fc->end) {
+ cur_size = fc->end - fc->start;
+ size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+ rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
+ size, ctxt->vcpu, NULL);
+ if (rc != X86EMUL_CONTINUE)
return rc;
- fc->start = linear;
- fc->end = linear + size;
+ fc->end += size;
}
- *dest = fc->data[linear - fc->start];
- return 0;
+ *dest = fc->data[eip - fc->start];
+ return X86EMUL_CONTINUE;
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, void *dest, unsigned size)
{
- int rc = 0;
+ int rc;
/* x86 instructions are limited to 15 bytes. */
- if (eip + size - ctxt->decode.eip_orig > 15)
+ if (eip + size - ctxt->eip > 15)
return X86EMUL_UNHANDLEABLE;
- eip += ctxt->cs_base;
while (size--) {
rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
*address = 0;
rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
ctxt->vcpu, NULL);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
if (c->rex_prefix) {
c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
switch (c->ad_bytes) {
case 2:
@@ -916,14 +924,18 @@ int
x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, group;
- /* Shadow copy of register state. Committed on successful emulation. */
+ /* we cannot decode insn before we complete previous rep insn */
+ WARN_ON(ctxt->restart);
+
+ /* Shadow copy of register state. Committed on successful emulation. */
memset(c, 0, sizeof(struct decode_cache));
- c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+ c->eip = ctxt->eip;
+ c->fetch.start = c->fetch.end = c->eip;
ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
@@ -1015,11 +1027,6 @@ done_prefixes:
}
}
- if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
- return -1;
- }
-
if (c->d & Group) {
group = c->d & GroupMask;
c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
rc = decode_modrm(ctxt, ops);
else if (c->d & MemAbs)
rc = decode_abs(ctxt, ops);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
if (c->ad_bytes != 8)
c->modrm_ea = (u32)c->modrm_ea;
+
+ if (c->rip_relative)
+ c->modrm_ea += c->eip;
+
/*
* Decode and fetch the source operand: register, memory
* or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
break;
}
c->src.type = OP_MEM;
+ c->src.ptr = (unsigned long *)c->modrm_ea;
+ c->src.val = 0;
break;
case SrcImm:
case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
c->src.bytes = 1;
c->src.val = 1;
break;
+ case SrcSI:
+ c->src.type = OP_MEM;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = (unsigned long *)
+ register_address(c, seg_override_base(ctxt, c),
+ c->regs[VCPU_REGS_RSI]);
+ c->src.val = 0;
+ break;
}
/*
@@ -1168,6 +1189,12 @@ done_prefixes:
c->src2.bytes = 1;
c->src2.val = 1;
break;
+ case Src2Mem16:
+ c->src2.type = OP_MEM;
+ c->src2.bytes = 2;
+ c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
+ c->src2.val = 0;
+ break;
}
/* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
break;
case DstMem:
+ case DstMem64:
if ((c->d & ModRM) && c->modrm_mod == 3) {
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
break;
}
c->dst.type = OP_MEM;
+ c->dst.ptr = (unsigned long *)c->modrm_ea;
+ if ((c->d & DstMask) == DstMem64)
+ c->dst.bytes = 8;
+ else
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.val = 0;
+ if (c->d & BitOp) {
+ unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+ c->dst.ptr = (void *)c->dst.ptr +
+ (c->src.val & mask) / 8;
+ }
break;
case DstAcc:
c->dst.type = OP_REG;
- c->dst.bytes = c->op_bytes;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->op_bytes) {
+ switch (c->dst.bytes) {
case 1:
c->dst.val = *(u8 *)c->dst.ptr;
break;
@@ -1203,18 +1243,248 @@ done_prefixes:
case 4:
c->dst.val = *(u32 *)c->dst.ptr;
break;
+ case 8:
+ c->dst.val = *(u64 *)c->dst.ptr;
+ break;
}
c->dst.orig_val = c->dst.val;
break;
+ case DstDI:
+ c->dst.type = OP_MEM;
+ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->dst.ptr = (unsigned long *)
+ register_address(c, es_base(ctxt),
+ c->regs[VCPU_REGS_RDI]);
+ c->dst.val = 0;
+ break;
}
- if (c->rip_relative)
- c->modrm_ea += c->eip;
-
done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->decode.io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ struct decode_cache *c = &ctxt->decode;
+ unsigned int in_page, n;
+ unsigned int count = c->rep_prefix ?
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+ in_page = (ctxt->eflags & EFLG_DF) ?
+ offset_in_page(c->regs[VCPU_REGS_RDI]) :
+ PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+ n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+ count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+ return 0;
+ rc->end = n * size;
+ }
+
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ return 1;
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct desc_ptr *dt)
+{
+ if (selector & 1 << 2) {
+ struct desc_struct desc;
+ memset (dt, 0, sizeof *dt);
+ if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = get_desc_base(&desc);
+ } else
+ ops->get_gdt(dt, ctxt->vcpu);
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ int ret;
+ u32 err;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+ if (dt.size < index * 8 + 7) {
+ kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ addr = dt.address + index * 8;
+ ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+ return ret;
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ u32 err;
+ ulong addr;
+ int ret;
+
+ get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+ if (dt.size < index * 8 + 7) {
+ kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ addr = dt.address + index * 8;
+ ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT)
+ kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+ return ret;
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 selector, int seg)
+{
+ struct desc_struct seg_desc;
+ u8 dpl, rpl, cpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ int ret;
+
+ memset(&seg_desc, 0, sizeof seg_desc);
+
+ if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+ || ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor */
+ set_desc_base(&seg_desc, selector << 4);
+ set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ goto load;
+ }
+
+ /* NULL selector is not valid for TR, CS and SS */
+ if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ && null_selector)
+ goto exception;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ if (null_selector) /* for NULL selector skip all following checks */
+ goto load;
+
+ ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = GP_VECTOR;
+
+ /* can't load system descriptor into segment selecor */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s)
+ goto exception;
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ rpl = selector & 3;
+ dpl = seg_desc.dpl;
+ cpl = ops->cpl(ctxt->vcpu);
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+load:
+ ops->set_segment_selector(selector, seg, ctxt->vcpu);
+ ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+ return X86EMUL_CONTINUE;
+exception:
+ kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
int rc;
unsigned long val, change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+ int cpl = ops->cpl(ctxt->vcpu);
rc = emulate_pop(ctxt, ops, &val, len);
if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
int rc;
rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
+ rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
return rc;
}
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
}
rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
break;
--reg;
}
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc;
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
- if (rc != 0)
- return rc;
- return 0;
+ return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
}
static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc = 0;
switch (c->modrm_reg) {
case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
emulate_1op("neg", c->dst, ctxt->eflags);
break;
default:
- DPRINTF("Cannot emulate %02x\n", c->b);
- rc = X86EMUL_UNHANDLEABLE;
- break;
+ return 0;
}
- return rc;
+ return 1;
}
static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
emulate_push(ctxt);
break;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long memop)
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- u64 old, new;
- int rc;
-
- rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ u64 old = c->dst.orig_val;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
-
} else {
- new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+ c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
(u32) c->regs[VCPU_REGS_RBX];
- rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
ctxt->eflags |= EFLG_ZF;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
unsigned long cs;
rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
if (c->op_bytes == 4)
c->eip = (u32)c->eip;
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
return rc;
- rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
+ rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
default:
break;
}
- return 0;
+ return X86EMUL_CONTINUE;
}
static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
u64 msr_data;
/* syscall is not available in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL) {
kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_PROPAGATE_FAULT;
}
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_PROPAGATE_FAULT;
}
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
int iopl;
if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+ return ops->cpl(ctxt->vcpu) > iopl;
}
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- if (emulator_bad_iopl(ctxt))
+ if (emulator_bad_iopl(ctxt, ops))
if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
return false;
return true;
}
+static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ int seg)
+{
+ struct desc_struct desc;
+ if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
+ return get_desc_base(&desc);
+ else
+ return ~0;
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_16 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ tss->ip = c->eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = c->regs[VCPU_REGS_RAX];
+ tss->cx = c->regs[VCPU_REGS_RCX];
+ tss->dx = c->regs[VCPU_REGS_RDX];
+ tss->bx = c->regs[VCPU_REGS_RBX];
+ tss->sp = c->regs[VCPU_REGS_RSP];
+ tss->bp = c->regs[VCPU_REGS_RBP];
+ tss->si = c->regs[VCPU_REGS_RSI];
+ tss->di = c->regs[VCPU_REGS_RDI];
+
+ tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+ tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+ tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+ tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_16 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int ret;
+
+ c->eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ c->regs[VCPU_REGS_RAX] = tss->ax;
+ c->regs[VCPU_REGS_RCX] = tss->cx;
+ c->regs[VCPU_REGS_RDX] = tss->dx;
+ c->regs[VCPU_REGS_RBX] = tss->bx;
+ c->regs[VCPU_REGS_RSP] = tss->sp;
+ c->regs[VCPU_REGS_RBP] = tss->bp;
+ c->regs[VCPU_REGS_RSI] = tss->si;
+ c->regs[VCPU_REGS_RDI] = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
+ ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+ ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 err, new_tss_base = get_desc_base(new_desc);
+
+ ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ save_state_to_tss16(ctxt, ops, &tss_seg);
+
+ ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = ops->write_std(new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link,
+ ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+ }
+
+ return load_state_from_tss16(ctxt, ops, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_32 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+ tss->eip = c->eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = c->regs[VCPU_REGS_RAX];
+ tss->ecx = c->regs[VCPU_REGS_RCX];
+ tss->edx = c->regs[VCPU_REGS_RDX];
+ tss->ebx = c->regs[VCPU_REGS_RBX];
+ tss->esp = c->regs[VCPU_REGS_RSP];
+ tss->ebp = c->regs[VCPU_REGS_RBP];
+ tss->esi = c->regs[VCPU_REGS_RSI];
+ tss->edi = c->regs[VCPU_REGS_RDI];
+
+ tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+ tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+ tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+ tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
+ tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
+ tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct tss_segment_32 *tss)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int ret;
+
+ ops->set_cr(3, tss->cr3, ctxt->vcpu);
+ c->eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+ c->regs[VCPU_REGS_RAX] = tss->eax;
+ c->regs[VCPU_REGS_RCX] = tss->ecx;
+ c->regs[VCPU_REGS_RDX] = tss->edx;
+ c->regs[VCPU_REGS_RBX] = tss->ebx;
+ c->regs[VCPU_REGS_RSP] = tss->esp;
+ c->regs[VCPU_REGS_RBP] = tss->ebp;
+ c->regs[VCPU_REGS_RSI] = tss->esi;
+ c->regs[VCPU_REGS_RDI] = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
+ ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+ ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+ ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
+ ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 err, new_tss_base = get_desc_base(new_desc);
+
+ ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ save_state_to_tss32(ctxt, ops, &tss_seg);
+
+ ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ return ret;
+ }
+
+ ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = ops->write_std(new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link,
+ ctxt->vcpu, &err);
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ /* FIXME: need to provide precise fault address */
+ kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ return ret;
+ }
+ }
+
+ return load_state_from_tss32(ctxt, ops, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+ ulong old_tss_base =
+ get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+ u32 desc_limit;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ if (reason != TASK_SWITCH_IRET) {
+ if ((tss_selector & 3) > next_tss_desc.dpl ||
+ ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
+ tss_selector & 0xfffc);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, ops, old_tss_sel,
+ &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used afetr this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, ops, tss_selector,
+ &next_tss_desc);
+ }
+
+ ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
+ ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+
+ if (has_error_code) {
+ struct decode_cache *c = &ctxt->decode;
+
+ c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ c->lock_prefix = 0;
+ c->src.val = (unsigned long) error_code;
+ emulate_push(ctxt);
+ }
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ memset(c, 0, sizeof(struct decode_cache));
+ c->eip = ctxt->eip;
+ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ c->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(ctxt->vcpu, c->eip);
+ rc = writeback(ctxt, ops);
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+ int reg, struct operand *op)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+
+ register_address_increment(c, &c->regs[reg], df * op->bytes);
+ op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
+}
+
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
- unsigned long memop = 0;
u64 msr_data;
- unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode;
- unsigned int port;
- int io_dir_in;
- int rc = 0;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = c->dst.type;
ctxt->interruptibility = 0;
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
*/
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
- saved_eip = c->eip;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
/* LOCK prefix is allowed only with some instructions */
- if (c->lock_prefix && !(c->d & Lock)) {
+ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
- if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
- memop = c->modrm_ea;
-
if (c->rep_prefix && (c->d & String)) {
+ ctxt->restart = true;
/* All REP prefixes have the same first termination condition */
- if (c->regs[VCPU_REGS_RCX] == 0) {
+ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
+ string_done:
+ ctxt->restart = false;
kvm_rip_write(ctxt->vcpu, c->eip);
goto done;
}
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
* - if REPNE/REPNZ and ZF = 1 then done
*/
if ((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf)) {
+ (c->b == 0xae) || (c->b == 0xaf)) {
if ((c->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0)) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
+ ((ctxt->eflags & EFLG_ZF) == 0))
+ goto string_done;
if ((c->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
+ ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
+ goto string_done;
}
- c->regs[VCPU_REGS_RCX]--;
- c->eip = kvm_rip_read(ctxt->vcpu);
+ c->eip = ctxt->eip;
}
if (c->src.type == OP_MEM) {
- c->src.ptr = (unsigned long *)memop;
- c->src.val = 0;
rc = ops->read_emulated((unsigned long)c->src.ptr,
&c->src.val,
c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
c->src.orig_val = c->src.val;
}
+ if (c->src2.type == OP_MEM) {
+ rc = ops->read_emulated((unsigned long)c->src2.ptr,
+ &c->src2.val,
+ c->src2.bytes,
+ ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
if ((c->d & DstMask) == ImplicitOps)
goto special_insn;
- if (c->dst.type == OP_MEM) {
- c->dst.ptr = (unsigned long *)memop;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.val = 0;
- if (c->d & BitOp) {
- unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
- c->dst.ptr = (void *)c->dst.ptr +
- (c->src.val & mask) / 8;
- }
- if (!(c->d & Mov)) {
- /* optimisation - avoid slow emulated read */
- rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
+ if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
+ c->dst.bytes, ctxt->vcpu);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
}
c->dst.orig_val = c->dst.val;
@@ -1926,7 +2575,7 @@ special_insn:
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
break;
case 0x61: /* popa */
rc = emulate_popa(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
+ c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ c->dst.bytes)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (kvm_emulate_pio_string(ctxt->vcpu,
- 1,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
- (ctxt->eflags & EFLG_DF),
- register_address(c, es_base(ctxt),
- c->regs[VCPU_REGS_RDI]),
- c->rep_prefix,
- c->regs[VCPU_REGS_RDX]) == 0) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
+ if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
+ c->regs[VCPU_REGS_RDX], &c->dst.val))
+ goto done; /* IO is needed, skip writeback */
+ break;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
+ c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ c->src.bytes)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (kvm_emulate_pio_string(ctxt->vcpu,
- 0,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
- (ctxt->eflags & EFLG_DF),
- register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- c->rep_prefix,
- c->regs[VCPU_REGS_RDX]) == 0) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
+ ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
+ &c->src.val, 1, ctxt->vcpu);
+
+ c->dst.type = OP_NONE; /* nothing to writeback */
+ break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(c->b, ctxt->eflags))
jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
case 0x8c: { /* mov r/m, sreg */
struct kvm_segment segreg;
- if (c->modrm_reg <= 5)
+ if (c->modrm_reg <= VCPU_SREG_GS)
kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
else {
- printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
- c->modrm);
- goto cannot_emulate;
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
}
c->dst.val = segreg.selector;
break;
@@ -2132,16 +2762,16 @@ special_insn:
}
if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+ toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
- rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
+ rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
}
case 0x8f: /* pop (sole member of Grp1a) */
rc = emulate_grp1a(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
break;
case 0xa4 ... 0xa5: /* movs */
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
+ goto mov;
case 0xa6 ... 0xa7: /* cmps */
- c->src.type = OP_NONE; /* Disable writeback. */
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]);
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
c->dst.type = OP_NONE; /* Disable writeback. */
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- rc = ops->read_emulated((unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->src.bytes
- : c->src.bytes);
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
-
- break;
+ goto cmp;
case 0xaa ... 0xab: /* stos */
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(c,
- es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
c->dst.val = c->regs[VCPU_REGS_RAX];
- register_address_increment(c, &c->regs[VCPU_REGS_RDI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
break;
case 0xac ... 0xad: /* lods */
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- rc = ops->read_emulated(register_address(c,
- seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]),
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- register_address_increment(c, &c->regs[VCPU_REGS_RSI],
- (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
- : c->dst.bytes);
- break;
+ goto mov;
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
break;
case 0xcb: /* ret far */
rc = emulate_ret_far(ctxt, ops);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
break;
case 0xe4: /* inb */
case 0xe5: /* in */
- port = c->src.val;
- io_dir_in = 1;
- goto do_io;
+ goto do_io_in;
case 0xe6: /* outb */
case 0xe7: /* out */
- port = c->src.val;
- io_dir_in = 0;
- goto do_io;
+ goto do_io_out;
case 0xe8: /* call (near) */ {
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
case 0xe9: /* jmp rel */
goto jmp;
case 0xea: /* jmp far */
- if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
- VCPU_SREG_CS))
+ jump_far:
+ if (load_segment_descriptor(ctxt, ops, c->src2.val,
+ VCPU_SREG_CS))
goto done;
c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
break;
case 0xec: /* in al,dx */
case 0xed: /* in (e/r)ax,dx */
- port = c->regs[VCPU_REGS_RDX];
- io_dir_in = 1;
- goto do_io;
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ do_io_in:
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+ kvm_inject_gp(ctxt->vcpu, 0);
+ goto done;
+ }
+ if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+ &c->dst.val))
+ goto done; /* IO is needed */
+ break;
case 0xee: /* out al,dx */
case 0xef: /* out (e/r)ax,dx */
- port = c->regs[VCPU_REGS_RDX];
- io_dir_in = 0;
- do_io:
- if (!emulator_io_permited(ctxt, ops, port,
- (c->d & ByteOp) ? 1 : c->op_bytes)) {
+ c->src.val = c->regs[VCPU_REGS_RDX];
+ do_io_out:
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
kvm_inject_gp(ctxt->vcpu, 0);
goto done;
}
- if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
- (c->d & ByteOp) ? 1 : c->op_bytes,
- port) != 0) {
- c->eip = saved_eip;
- goto cannot_emulate;
- }
+ ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
+ ctxt->vcpu);
+ c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf6 ... 0xf7: /* Grp3 */
- rc = emulate_grp3(ctxt, ops);
- if (rc != 0)
- goto done;
+ if (!emulate_grp3(ctxt, ops))
+ goto cannot_emulate;
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt))
+ if (emulator_bad_iopl(ctxt, ops))
kvm_inject_gp(ctxt->vcpu, 0);
else {
ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
}
break;
case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt))
+ if (emulator_bad_iopl(ctxt, ops))
kvm_inject_gp(ctxt->vcpu, 0);
else {
- toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+ toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
}
@@ -2383,28 +2940,55 @@ special_insn:
ctxt->eflags |= EFLG_DF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
- case 0xfe ... 0xff: /* Grp4/Grp5 */
+ case 0xfe: /* Grp4 */
+ grp45:
rc = emulate_grp45(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
+ case 0xff: /* Grp5 */
+ if (c->modrm_reg == 5)
+ goto jump_far;
+ goto grp45;
}
writeback:
rc = writeback(ctxt, ops);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
+ /*
+ * restore dst type in case the decoding will be reused
+ * (happens for string instruction )
+ */
+ c->dst.type = saved_dst_type;
+
+ if ((c->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
+ &c->src);
+
+ if ((c->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+
+ if (c->rep_prefix && (c->d & String)) {
+ struct read_cache *rc = &ctxt->decode.io_read;
+ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+ /*
+ * Re-enter guest when pio read ahead buffer is empty or,
+ * if it is not used, after each 1024 iteration.
+ */
+ if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+ (rc->end != 0 && rc->end == rc->pos))
+ ctxt->restart = false;
+ }
+
/* Commit shadow register state. */
memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(ctxt->vcpu, c->eip);
+ ops->set_rflags(ctxt->vcpu, ctxt->eflags);
done:
- if (rc == X86EMUL_UNHANDLEABLE) {
- c->eip = saved_eip;
- return -1;
- }
- return 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
twobyte_insn:
switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
goto cannot_emulate;
rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
/* Let the processor re-execute the fixed hypercall */
- c->eip = kvm_rip_read(ctxt->vcpu);
+ c->eip = ctxt->eip;
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
case 2: /* lgdt */
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address, c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
realmode_lgdt(ctxt->vcpu, size, address);
/* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
switch (c->modrm_rm) {
case 1:
rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address,
c->op_bytes);
- if (rc)
+ if (rc != X86EMUL_CONTINUE)
goto done;
realmode_lidt(ctxt->vcpu, size, address);
}
@@ -2459,15 +3043,18 @@ twobyte_insn:
break;
case 4: /* smsw */
c->dst.bytes = 2;
- c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
+ c->dst.val = ops->get_cr(0, ctxt->vcpu);
break;
case 6: /* lmsw */
- realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
- &ctxt->eflags);
+ ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+ (c->src.val & 0x0f), ctxt->vcpu);
c->dst.type = OP_NONE;
break;
+ case 5: /* not defined */
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, memop);
+ emulate_invlpg(ctxt->vcpu, c->modrm_ea);
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x20: /* mov cr, reg */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- c->regs[c->modrm_rm] =
- realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+ switch (c->modrm_reg) {
+ case 1:
+ case 5 ... 7:
+ case 9 ... 15:
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+ c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x21: /* mov from dr to reg */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
- if (rc)
- goto cannot_emulate;
+ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+ (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+ emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x22: /* mov reg, cr */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- realmode_set_cr(ctxt->vcpu,
- c->modrm_reg, c->modrm_val, &ctxt->eflags);
+ ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- rc = emulator_set_dr(ctxt, c->modrm_reg,
- c->regs[c->modrm_rm]);
- if (rc)
- goto cannot_emulate;
+ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+ (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+ kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ goto done;
+ }
+ emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
- if (rc) {
+ if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = kvm_rip_read(ctxt->vcpu);
+ goto done;
}
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
case 0x32:
/* rdmsr */
- rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
- if (rc) {
+ if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
kvm_inject_gp(ctxt->vcpu, 0);
- c->eip = kvm_rip_read(ctxt->vcpu);
+ goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
- if (rc != 0)
+ if (rc != X86EMUL_CONTINUE)
goto done;
break;
case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
(u64) c->src.val;
break;
case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops, memop);
- if (rc != 0)
+ rc = emulate_grp9(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
goto done;
- c->dst.type = OP_NONE;
break;
}
goto writeback;
cannot_emulate:
DPRINTF("Cannot emulate %02x\n", c->b);
- c->eip = saved_eip;
return -1;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa1..93825ff 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
#include <linux/kvm_host.h>
#include "trace.h"
+static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
+{
+ raw_spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
+{
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+
+ s->wakeup_needed = false;
+
+ raw_spin_unlock(&s->lock);
+
+ if (wakeup) {
+ vcpu = s->kvm->bsp_vcpu;
+ if (vcpu)
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
* Other interrupt may be delivered to PIC while lock is dropped but
* it should be safe since PIC state is already updated at this stage.
*/
- raw_spin_unlock(&s->pics_state->lock);
+ pic_unlock(s->pics_state);
kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
- raw_spin_lock(&s->pics_state->lock);
+ pic_lock(s->pics_state);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
- raw_spin_lock(&s->lock);
+ pic_lock(s);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
}
/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
- raw_spin_lock(&s->lock);
+ pic_lock(s);
pic_update_irq(s);
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
}
int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
struct kvm_pic *s = opaque;
int ret = -1;
- raw_spin_lock(&s->lock);
+ pic_lock(s);
if (irq >= 0 && irq < PIC_NUM_PINS) {
ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
}
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return ret;
}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
- raw_spin_lock(&s->lock);
+ pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return intno;
}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return 0;
}
- raw_spin_lock(&s->lock);
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return 0;
}
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return 0;
}
- raw_spin_lock(&s->lock);
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
- raw_spin_unlock(&s->lock);
+ pic_unlock(s);
return 0;
}
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- kvm_vcpu_kick(vcpu);
+ s->wakeup_needed = true;
}
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915..cd1f362 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
struct kvm_pic {
raw_spinlock_t lock;
+ bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524..64bc6ea 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
};
struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
+ bool (*is_periodic)(struct kvm_timer *);
};
-
enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
-
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 48aeee8..7a17db1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
#include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-
-struct kvm_unsync_walk {
- int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
- set_page_private(page, 0);
cache->objects[cache->nobjs++] = page_address(page);
}
return 0;
@@ -925,7 +918,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&sp->oos_link);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
sp->parent_pte = parent_pte;
@@ -1009,8 +1001,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
}
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
@@ -1019,8 +1010,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ fn(parent_sp);
+ mmu_parent_walk(parent_sp, fn);
return;
}
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1019,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!pte_chain->parent_ptes[i])
break;
parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ fn(parent_sp);
+ mmu_parent_walk(parent_sp, fn);
}
}
@@ -1066,16 +1057,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
}
}
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int unsync_walk_fn(struct kvm_mmu_page *sp)
{
kvm_mmu_update_parents_unsync(sp);
return 1;
}
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ mmu_parent_walk(sp, unsync_walk_fn);
kvm_mmu_update_parents_unsync(sp);
}
@@ -1209,7 +1199,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ if (sp->role.cr4_pae != !!is_pae(vcpu)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
@@ -1331,6 +1321,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu.base_role;
role.level = level;
role.direct = direct;
+ if (role.direct)
+ role.cr4_pae = 0;
role.access = access;
if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1343,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) {
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
trace_kvm_mmu_get_page(sp, false);
return sp;
@@ -1490,8 +1482,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
for_each_sp(pages, sp, parents, i) {
kvm_mmu_zap_page(kvm, sp);
mmu_pages_clear_parents(&parents);
+ zapped++;
}
- zapped += pages.nr;
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1542,14 +1534,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
*/
if (used_pages > kvm_nr_mmu_pages) {
- while (used_pages > kvm_nr_mmu_pages) {
+ while (used_pages > kvm_nr_mmu_pages &&
+ !list_empty(&kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(kvm, page);
+ used_pages -= kvm_mmu_zap_page(kvm, page);
used_pages--;
}
+ kvm_nr_mmu_pages = used_pages;
kvm->arch.n_free_mmu_pages = 0;
}
else
@@ -1571,13 +1565,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
r = 0;
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
+restart:
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.direct) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
if (kvm_mmu_zap_page(kvm, sp))
- n = bucket->first;
+ goto restart;
}
return r;
}
@@ -1591,12 +1586,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
+restart:
hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
if (sp->gfn == gfn && !sp->role.direct
&& !sp->role.invalid) {
pgprintk("%s: zap %lx %x\n",
__func__, gfn, sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ goto restart;
}
}
}
@@ -1762,7 +1759,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(sp);
mmu_convert_notrap(sp);
return 0;
@@ -2296,13 +2293,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
/* no rsvd bits for 2 level 4K page table entries */
context->rsvd_bits_mask[0][1] = 0;
context->rsvd_bits_mask[0][0] = 0;
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+ if (!is_pse(vcpu)) {
+ context->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
break;
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
@@ -2315,7 +2318,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2333,7 +2336,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+ context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
break;
}
}
@@ -2435,7 +2438,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
else
r = paging32_init_context(vcpu);
- vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+ vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
return r;
}
@@ -2524,7 +2527,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- if (sp->role.glevels == PT32_ROOT_LEVEL)
+ if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
else
paging64_update_pte(vcpu, sp, spte, new);
@@ -2559,36 +2562,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
}
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ u64 gpte)
{
gfn_t gfn;
- int r;
- u64 gpte = 0;
pfn_t pfn;
- if (bytes != 4 && bytes != 8)
- return;
-
- /*
- * Assume that the pte write on a page table of the same type
- * as the current vcpu paging mode. This is nearly always true
- * (might be false while changing modes). Note it is verified later
- * by update_pte().
- */
- if (is_pae(vcpu)) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if ((bytes == 4) && (gpa % 4 == 0)) {
- r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
- if (r)
- return;
- memcpy((void *)&gpte + (gpa % 8), new, 4);
- } else if ((bytes == 8) && (gpa % 8 == 0)) {
- memcpy((void *)&gpte, new, 8);
- }
- } else {
- if ((bytes == 4) && (gpa % 4 == 0))
- memcpy((void *)&gpte, new, 4);
- }
if (!is_present_gpte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2637,10 +2615,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int flooded = 0;
int npte;
int r;
+ int invlpg_counter;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+
+ invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode. This is nearly always true
+ * (might be false while changing modes). Note it is verified later
+ * by update_pte().
+ */
+ if ((is_pae(vcpu) && bytes == 4) || !new) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ if (is_pae(vcpu)) {
+ gpa &= ~(gpa_t)7;
+ bytes = 8;
+ }
+ r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
+ if (r)
+ gentry = 0;
+ new = (const u8 *)&gentry;
+ }
+
+ switch (bytes) {
+ case 4:
+ gentry = *(const u32 *)new;
+ break;
+ case 8:
+ gentry = *(const u64 *)new;
+ break;
+ default:
+ gentry = 0;
+ break;
+ }
+
+ mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
spin_lock(&vcpu->kvm->mmu_lock);
+ if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+ gentry = 0;
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
@@ -2659,10 +2673,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+
+restart:
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
continue;
- pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+ pte_size = sp->role.cr4_pae ? 8 : 4;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
if (misaligned || flooded) {
@@ -2679,14 +2695,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
if (kvm_mmu_zap_page(vcpu->kvm, sp))
- n = bucket->first;
+ goto restart;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
page_offset = offset;
level = sp->role.level;
npte = 1;
- if (sp->role.glevels == PT32_ROOT_LEVEL) {
+ if (!sp->role.cr4_pae) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -2704,20 +2720,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
continue;
}
spte = &sp->spt[page_offset / sizeof(*spte)];
- if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
- gentry = 0;
- r = kvm_read_guest_atomic(vcpu->kvm,
- gpa & ~(u64)(pte_size - 1),
- &gentry, pte_size);
- new = (const void *)&gentry;
- if (r < 0)
- new = NULL;
- }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (new)
- mmu_pte_write_new_pte(vcpu, sp, spte, new);
+ if (gentry)
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
@@ -2897,10 +2904,11 @@ void kvm_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *sp, *node;
spin_lock(&kvm->mmu_lock);
+restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
if (kvm_mmu_zap_page(kvm, sp))
- node = container_of(kvm->arch.active_mmu_pages.next,
- struct kvm_mmu_page, link);
+ goto restart;
+
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
@@ -3171,8 +3179,7 @@ static gva_t canonicalize(gva_t gva)
}
-typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
- u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
inspect_spte_fn fn)
@@ -3188,7 +3195,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
child = page_header(ent & PT64_BASE_ADDR_MASK);
__mmu_spte_walk(kvm, child, fn);
} else
- fn(kvm, sp, &sp->spt[i]);
+ fn(kvm, &sp->spt[i]);
}
}
}
@@ -3279,6 +3286,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
static int count_rmaps(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots;
int nmaps = 0;
int i, j, k, idx;
@@ -3312,7 +3321,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
return nmaps;
}
-void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
@@ -3328,14 +3337,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
printk(KERN_ERR "%s: no memslot for gfn %ld\n",
audit_msg, gfn);
printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
- audit_msg, sptep - rev_sp->spt,
+ audit_msg, (long int)(sptep - rev_sp->spt),
rev_sp->gfn);
dump_stack();
return;
}
rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- is_large_pte(*sptep));
+ rev_sp->role.level);
if (!*rmapp) {
if (!printk_ratelimit())
return;
@@ -3370,7 +3379,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
continue;
if (!(ent & PT_WRITABLE_MASK))
continue;
- inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
+ inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
}
}
return;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6..3851f1f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,8 +6,6 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE mmutrace
#define KVM_MMU_PAGE_FIELDS \
__field(__u64, gfn) \
@@ -30,9 +28,10 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
+ trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s %spge" \
" %snxe root %u %s%c", \
- __entry->gfn, role.level, role.glevels, \
+ __entry->gfn, role.level, \
+ role.cr4_pae ? " pae" : "", \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
@@ -216,5 +215,10 @@ TRACE_EVENT(
#endif /* _TRACE_KVMMMU_H */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a..d9dea28 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
goto access_error;
#if PTTYPE == 64
- if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+ if (fetch_fault && (pte & PT64_NX_MASK))
goto access_error;
#endif
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
+ u64 new_spte;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte))
- __set_spte(spte, shadow_notrap_nonpresent_pte);
+ if (!is_present_gpte(gpte)) {
+ if (page->unsync)
+ new_spte = shadow_trap_nonpresent_pte;
+ else
+ new_spte = shadow_notrap_nonpresent_pte;
+ __set_spte(spte, new_spte);
+ }
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
+ gpa_t pte_gpa = -1;
int level;
u64 *sptep;
int need_flush = 0;
@@ -470,6 +477,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (level == PT_PAGE_TABLE_LEVEL ||
((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ pte_gpa = (sp->gfn << PAGE_SHIFT);
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +498,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (need_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
+
+ atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (pte_gpa == -1)
+ return;
+
+ if (mmu_topup_memory_caches(vcpu))
+ return;
+ kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 445c594..ab78eb8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,10 +44,11 @@ MODULE_LICENSE("GPL");
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+#define SVM_FEATURE_NPT (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -70,6 +71,7 @@ struct kvm_vcpu;
struct nested_state {
struct vmcb *hsave;
u64 hsave_msr;
+ u64 vm_cr_msr;
u64 vmcb;
/* These are the merged vectors */
@@ -77,6 +79,7 @@ struct nested_state {
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
+ u64 vmcb_iopm;
/* A VMEXIT is required but not yet emulated */
bool exit_required;
@@ -91,6 +94,9 @@ struct nested_state {
};
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -110,13 +116,39 @@ struct vcpu_svm {
struct nested_state nested;
bool nmi_singlestep;
+
+ unsigned int3_injected;
+ unsigned long int3_rip;
+};
+
+#define MSR_INVALID 0xffffffffU
+
+static struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+ { .index = MSR_K6_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_INVALID, .always = false },
};
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
#else
-static bool npt_enabled = false;
+static bool npt_enabled;
#endif
static int npt = 1;
@@ -129,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
@@ -163,8 +196,8 @@ static unsigned long iopm_base;
struct kvm_ldttss_desc {
u16 limit0;
u16 base0;
- unsigned base1 : 8, type : 5, dpl : 2, p : 1;
- unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
u32 base3;
u32 zero1;
} __attribute__((packed));
@@ -194,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+static u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
#define MAX_INST_SIZE 15
static inline u32 svm_has(u32 feat)
@@ -213,7 +267,7 @@ static inline void stgi(void)
static inline void invlpga(unsigned long addr, u32 asid)
{
- asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
+ asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +289,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.efer = efer;
}
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /* If we are within a nested VM we'd better #VMEXIT and let the
- guest handle the exception */
- if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
- return;
-
- svm->vmcb->control.event_inj = nr
- | SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
- | SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
static int is_external_interrupt(u32 info)
{
info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+ ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
@@ -283,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (svm->vmcb->control.next_rip != 0)
+ svm->next_rip = svm->vmcb->control.next_rip;
+
if (!svm->next_rip) {
if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
EMULATE_DONE)
@@ -297,6 +337,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm_set_interrupt_shadow(vcpu, 0);
}
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If we are within a nested VM we'd better #VMEXIT and let the guest
+ * handle the exception
+ */
+ if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
+ if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+ unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+ /*
+ * For guest debugging where we have to reinject #BP if some
+ * INT3 is guest-owned:
+ * Emulate nRIP by moving RIP forward. Will fail if injection
+ * raises a fault that is not intercepted. Still better than
+ * failing in all cases.
+ */
+ skip_emulated_instruction(&svm->vcpu);
+ rip = kvm_rip_read(&svm->vcpu);
+ svm->int3_rip = rip + svm->vmcb->save.cs.base;
+ svm->int3_injected = rip - old_rip;
+ }
+
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = error_code;
+}
+
static int has_svm(void)
{
const char *msg;
@@ -319,7 +394,7 @@ static int svm_hardware_enable(void *garbage)
struct svm_cpu_data *sd;
uint64_t efer;
- struct descriptor_table gdt_descr;
+ struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -344,8 +419,8 @@ static int svm_hardware_enable(void *garbage)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
- kvm_get_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.base;
+ native_store_gdt(&gdt_descr);
+ gdt = (struct desc_struct *)gdt_descr.address;
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -391,42 +466,98 @@ err_1:
}
+static bool valid_msr_intercept(u32 index)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == index)
+ return true;
+
+ return false;
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers extend the direct_access_msrs list at the
+ * beginning of the file
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+}
+
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
int i;
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr >= msrpm_ranges[i] &&
- msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
- u32 msr_offset = (i * MSRS_IN_RANGE + msr -
- msrpm_ranges[i]) * 2;
-
- u32 *base = msrpm + (msr_offset / 32);
- u32 msr_shift = msr_offset % 32;
- u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
- *base = (*base & ~(0x3 << msr_shift)) |
- (mask << msr_shift);
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+
+ set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
return;
- }
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
}
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
BUG();
}
-static void svm_vcpu_init_msrpm(u32 *msrpm)
+static void init_msrpm_offsets(void)
{
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ int i;
-#ifdef CONFIG_X86_64
- set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
- set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
- set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
- set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
-#endif
- set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
}
static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +598,8 @@ static __init int svm_hardware_setup(void)
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ init_msrpm_offsets();
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -523,7 +656,7 @@ static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
seg->limit = 0xffff;
seg->base = 0;
}
@@ -543,16 +676,16 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vcpu.fpu_active = 1;
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
+ control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
+ control->intercept_cr_write = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK |
INTERCEPT_CR8_MASK;
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
+ control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
@@ -561,7 +694,7 @@ static void init_vmcb(struct vcpu_svm *svm)
INTERCEPT_DR6_MASK |
INTERCEPT_DR7_MASK;
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
+ control->intercept_dr_write = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
INTERCEPT_DR2_MASK |
INTERCEPT_DR3_MASK |
@@ -575,7 +708,7 @@ static void init_vmcb(struct vcpu_svm *svm)
(1 << MC_VECTOR);
- control->intercept = (1ULL << INTERCEPT_INTR) |
+ control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
(1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +769,8 @@ static void init_vmcb(struct vcpu_svm *svm)
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
- /* This is the guest-visible cr0 value.
+ /*
+ * This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -706,30 +840,30 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (err)
goto free_svm;
+ err = -ENOMEM;
page = alloc_page(GFP_KERNEL);
- if (!page) {
- err = -ENOMEM;
+ if (!page)
goto uninit;
- }
- err = -ENOMEM;
msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
- goto uninit;
+ goto free_page1;
nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
- goto uninit;
-
- svm->msrpm = page_address(msrpm_pages);
- svm_vcpu_init_msrpm(svm->msrpm);
+ goto free_page2;
hsave_page = alloc_page(GFP_KERNEL);
if (!hsave_page)
- goto uninit;
+ goto free_page3;
+
svm->nested.hsave = page_address(hsave_page);
+ svm->msrpm = page_address(msrpm_pages);
+ svm_vcpu_init_msrpm(svm->msrpm);
+
svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
@@ -744,6 +878,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
return &svm->vcpu;
+free_page3:
+ __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page2:
+ __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page1:
+ __free_page(page);
uninit:
kvm_vcpu_uninit(&svm->vcpu);
free_svm:
@@ -877,7 +1017,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
- /* AMD's VMCB does not have an explicit unusable field, so emulate it
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
var->unusable = !var->present || (var->type == 0);
@@ -913,7 +1054,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->type |= 0x1;
break;
case VCPU_SREG_SS:
- /* On AMD CPUs sometimes the DB bit in the segment
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
* descriptor is left as 1, although the whole segment has
* been made unusable. Clear it here to pass an Intel VMX
* entry check when cross vendor migrating.
@@ -931,36 +1073,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
return save->cpl;
}
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- dt->limit = svm->vmcb->save.idtr.limit;
- dt->base = svm->vmcb->save.idtr.base;
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
}
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.idtr.limit = dt->limit;
- svm->vmcb->save.idtr.base = dt->base ;
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address ;
}
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- dt->limit = svm->vmcb->save.gdtr.limit;
- dt->base = svm->vmcb->save.gdtr.base;
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
}
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.gdtr.limit = dt->limit;
- svm->vmcb->save.gdtr.base = dt->base ;
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address ;
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -973,6 +1115,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void update_cr0_intercept(struct vcpu_svm *svm)
{
+ struct vmcb *vmcb = svm->vmcb;
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
@@ -984,11 +1127,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ if (is_nested(svm)) {
+ struct vmcb *hsave = svm->nested.hsave;
+
+ hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+ hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+ vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
+ vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
+ }
} else {
svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ if (is_nested(svm)) {
+ struct vmcb *hsave = svm->nested.hsave;
+
+ hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+ hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+ }
}
}
@@ -996,6 +1153,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_nested(svm)) {
+ /*
+ * We are here because we run in nested mode, the host kvm
+ * intercepts cr0 writes but the l1 hypervisor does not.
+ * But the L1 hypervisor may intercept selective cr0 writes.
+ * This needs to be checked here.
+ */
+ unsigned long old, new;
+
+ /* Remove bits that would trigger a real cr0 write intercept */
+ old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
+ new = cr0 & SVM_CR0_SELECTIVE_MASK;
+
+ if (old == new) {
+ /* cr0 write with ts and mp unchanged */
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+ return;
+ }
+ }
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1129,70 +1307,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->vmcb->control.asid = sd->next_asid++;
}
-static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (dr) {
- case 0 ... 3:
- *dest = vcpu->arch.db[dr];
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 6:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- *dest = vcpu->arch.dr6;
- else
- *dest = svm->vmcb->save.dr6;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 7:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- *dest = vcpu->arch.dr7;
- else
- *dest = svm->vmcb->save.dr7;
- break;
- }
-
- return EMULATE_DONE;
-}
-
-static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = value;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = value;
- break;
- case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 6:
- vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return EMULATE_FAIL; /* will re-inject UD */
- /* fall through */
- case 7:
- vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
- vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
- }
- break;
- }
-
- return EMULATE_DONE;
+ svm->vmcb->save.dr7 = value;
}
static int pf_interception(struct vcpu_svm *svm)
@@ -1229,7 +1348,7 @@ static int db_interception(struct vcpu_svm *svm)
}
if (svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc =
svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1263,7 +1382,22 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+ u32 excp;
+
+ if (is_nested(svm)) {
+ u32 h_excp, n_excp;
+
+ h_excp = svm->nested.hsave->control.intercept_exceptions;
+ n_excp = svm->nested.intercept_exceptions;
+ h_excp &= ~(1 << NM_VECTOR);
+ excp = h_excp | n_excp;
+ } else {
+ excp = svm->vmcb->control.intercept_exceptions;
+ excp &= ~(1 << NM_VECTOR);
+ }
+
+ svm->vmcb->control.intercept_exceptions = excp;
+
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
}
@@ -1304,29 +1438,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
static int io_interception(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
-
- svm->next_rip = svm->vmcb->control.exit_info_2;
-
string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
- if (string) {
- if (emulate_instruction(&svm->vcpu,
- 0, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
-
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ if (string || in)
+ return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-
+ svm->next_rip = svm->vmcb->control.exit_info_2;
skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_pio(&svm->vcpu, in, size, port);
+
+ return kvm_fast_pio_out(vcpu, size, port);
}
static int nmi_interception(struct vcpu_svm *svm)
@@ -1379,6 +1507,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code)
{
+ int vmexit;
+
if (!is_nested(svm))
return 0;
@@ -1387,21 +1517,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_info_1 = error_code;
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- return nested_svm_exit_handled(svm);
+ vmexit = nested_svm_intercept(svm);
+ if (vmexit == NESTED_EXIT_DONE)
+ svm->nested.exit_required = true;
+
+ return vmexit;
}
-static inline int nested_svm_intr(struct vcpu_svm *svm)
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
if (!is_nested(svm))
- return 0;
+ return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return 0;
+ return true;
if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return 0;
+ return false;
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
if (svm->nested.intercept & 1ULL) {
/*
@@ -1412,21 +1549,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
*/
svm->nested.exit_required = true;
trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
- return 1;
+ return false;
}
- return 0;
+ return true;
+}
+
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+ if (!is_nested(svm))
+ return true;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+ return true;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+ svm->nested.exit_required = true;
+
+ return false;
}
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
{
struct page *page;
+ might_sleep();
+
page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
- return kmap_atomic(page, idx);
+ *_page = page;
+
+ return kmap(page);
error:
kvm_release_page_clean(page);
@@ -1435,61 +1591,55 @@ error:
return NULL;
}
-static void nested_svm_unmap(void *addr, enum km_type idx)
+static void nested_svm_unmap(struct page *page)
{
- struct page *page;
+ kunmap(page);
+ kvm_release_page_dirty(page);
+}
- if (!addr)
- return;
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port;
+ u8 val, bit;
+ u64 gpa;
- page = kmap_atomic_to_page(addr);
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
- kunmap_atomic(addr, idx);
- kvm_release_page_dirty(page);
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ bit = port % 8;
+ val = 0;
+
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+ val &= (1 << bit);
+
+ return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
-static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 param = svm->vmcb->control.exit_info_1 & 1;
- u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- bool ret = false;
- u32 t0, t1;
- u8 *msrpm;
+ u32 offset, msr, value;
+ int write, mask;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return false;
-
- msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+ return NESTED_EXIT_HOST;
- if (!msrpm)
- goto out;
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
- switch (msr) {
- case 0 ... 0x1fff:
- t0 = (msr * 2) % 8;
- t1 = msr / 8;
- break;
- case 0xc0000000 ... 0xc0001fff:
- t0 = (8192 + msr - 0xc0000000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- case 0xc0010000 ... 0xc0011fff:
- t0 = (16384 + msr - 0xc0010000) * 2;
- t1 = (t0 / 8);
- t0 %= 8;
- break;
- default:
- ret = true;
- goto out;
- }
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
- ret = msrpm[t1] & ((1 << param) << t0);
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
-out:
- nested_svm_unmap(msrpm, KM_USER0);
+ if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+ return NESTED_EXIT_DONE;
- return ret;
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1500,16 +1650,19 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
case SVM_EXIT_INTR:
case SVM_EXIT_NMI:
return NESTED_EXIT_HOST;
- /* For now we are always handling NPFs when using them */
case SVM_EXIT_NPF:
+ /* For now we are always handling NPFs when using them */
if (npt_enabled)
return NESTED_EXIT_HOST;
break;
- /* When we're shadowing, trap PFs */
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ /* When we're shadowing, trap PFs */
if (!npt_enabled)
return NESTED_EXIT_HOST;
break;
+ case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+ nm_interception(svm);
+ break;
default:
break;
}
@@ -1520,7 +1673,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
/*
* If this function returns true, this #vmexit was already handled
*/
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int nested_svm_intercept(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
int vmexit = NESTED_EXIT_HOST;
@@ -1529,6 +1682,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
case SVM_EXIT_MSR:
vmexit = nested_svm_exit_handled_msr(svm);
break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
if (svm->nested.intercept_cr_read & cr_bits)
@@ -1566,9 +1722,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
}
}
- if (vmexit == NESTED_EXIT_DONE) {
+ return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
nested_svm_vmexit(svm);
- }
return vmexit;
}
@@ -1610,6 +1774,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
vmcb->control.exit_info_1,
@@ -1617,10 +1782,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb->control.exit_int_info,
vmcb->control.exit_int_info_err);
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
if (!nested_vmcb)
return 1;
+ /* Exit nested SVM mode */
+ svm->nested.vmcb = 0;
+
/* Give the current vmcb to the guest */
disable_gif(svm);
@@ -1630,9 +1798,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.ds = vmcb->save.ds;
nested_vmcb->save.gdtr = vmcb->save.gdtr;
nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
if (npt_enabled)
nested_vmcb->save.cr3 = vmcb->save.cr3;
+ else
+ nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1704,10 +1876,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
- /* Exit nested SVM mode */
- svm->nested.vmcb = 0;
-
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
@@ -1717,19 +1886,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- u32 *nested_msrpm;
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is omptimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
int i;
- nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
- if (!nested_msrpm)
- return false;
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
- for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
- svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
- nested_svm_unmap(nested_msrpm, KM_USER0);
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
return true;
}
@@ -1739,26 +1922,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return false;
- /* nested_vmcb is our indicator if nested SVM is activated */
- svm->nested.vmcb = svm->vmcb->save.rax;
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
nested_vmcb->save.rip,
nested_vmcb->control.int_ctl,
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
+ nested_vmcb->control.intercept_cr_write,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
- /* Save the old vmcb, so we don't need to pick what we save, but
- can restore everything when a VMEXIT occurs */
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
hsave->save.es = vmcb->save.es;
hsave->save.cs = vmcb->save.cs;
hsave->save.ss = vmcb->save.ss;
@@ -1798,14 +1989,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (npt_enabled) {
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else {
+ } else
kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
- kvm_mmu_reset_context(&svm->vcpu);
- }
+
+ /* Guest paging mode is active - reset mmu */
+ kvm_mmu_reset_context(&svm->vcpu);
+
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
/* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = nested_vmcb->save.rax;
svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1814,22 +2008,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
- /* We don't want a nested guest to be more powerful than the guest,
- so all intercepts are ORed */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1846,13 +2026,40 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
else
svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+ if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+ /* We only want the cr8 intercept bits of the guest */
+ svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
+ svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ }
+
+ /*
+ * We don't want a nested guest to be more powerful than the guest, so
+ * all intercepts are ORed
+ */
+ svm->vmcb->control.intercept_cr_read |=
+ nested_vmcb->control.intercept_cr_read;
+ svm->vmcb->control.intercept_cr_write |=
+ nested_vmcb->control.intercept_cr_write;
+ svm->vmcb->control.intercept_dr_read |=
+ nested_vmcb->control.intercept_dr_read;
+ svm->vmcb->control.intercept_dr_write |=
+ nested_vmcb->control.intercept_dr_write;
+ svm->vmcb->control.intercept_exceptions |=
+ nested_vmcb->control.intercept_exceptions;
+
+ svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+ svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
+
+ /* nested_vmcb is our indicator if nested SVM is activated */
+ svm->nested.vmcb = vmcb_gpa;
enable_gif(svm);
@@ -1878,6 +2085,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
+ struct page *page;
if (nested_svm_check_permissions(svm))
return 1;
@@ -1885,12 +2093,12 @@ static int vmload_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
return 1;
}
@@ -1898,6 +2106,7 @@ static int vmload_interception(struct vcpu_svm *svm)
static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
+ struct page *page;
if (nested_svm_check_permissions(svm))
return 1;
@@ -1905,12 +2114,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
skip_emulated_instruction(&svm->vcpu);
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(nested_vmcb, KM_USER0);
+ nested_svm_unmap(page);
return 1;
}
@@ -2013,6 +2222,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
uint32_t idt_v =
svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
tss_selector = (u16)svm->vmcb->control.exit_info_1;
@@ -2033,6 +2244,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
svm->vcpu.arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
kvm_clear_exception_queue(&svm->vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
@@ -2049,7 +2266,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
(int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
skip_emulated_instruction(&svm->vcpu);
- return kvm_task_switch(&svm->vcpu, tss_selector, reason);
+ if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ svm->vcpu.run->internal.ndata = 0;
+ return 0;
+ }
+ return 1;
}
static int cpuid_interception(struct vcpu_svm *svm)
@@ -2140,9 +2364,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_SYSENTER_ESP:
*data = svm->sysenter_esp;
break;
- /* Nobody will change the following 5 values in the VMCB so
- we can safely return them on rdmsr. They will always be 0
- until LBRV is implemented. */
+ /*
+ * Nobody will change the following 5 values in the VMCB so we can
+ * safely return them on rdmsr. They will always be 0 until LBRV is
+ * implemented.
+ */
case MSR_IA32_DEBUGCTLMSR:
*data = svm->vmcb->save.dbgctl;
break;
@@ -2162,7 +2388,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
- *data = 0;
+ *data = svm->nested.vm_cr_msr;
break;
case MSR_IA32_UCODE_REV:
*data = 0x01000065;
@@ -2192,6 +2418,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
return 1;
}
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2258,6 +2509,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->nested.hsave_msr = data;
break;
case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
@@ -2321,16 +2573,16 @@ static int pause_interception(struct vcpu_svm *svm)
}
static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
+ [SVM_EXIT_READ_CR0] = emulate_on_interception,
+ [SVM_EXIT_READ_CR3] = emulate_on_interception,
+ [SVM_EXIT_READ_CR4] = emulate_on_interception,
+ [SVM_EXIT_READ_CR8] = emulate_on_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = emulate_on_interception,
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
[SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2349,15 +2601,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
- [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
- /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
@@ -2365,7 +2616,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
[SVM_EXIT_INVLPGA] = invlpga_interception,
- [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2388,7 +2639,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, svm->vmcb->save.rip);
+ trace_kvm_exit(exit_code, vcpu);
if (unlikely(svm->nested.exit_required)) {
nested_svm_vmexit(svm);
@@ -2506,6 +2757,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
if (irr == -1)
return;
@@ -2563,13 +2817,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- nested_svm_intr(svm);
-
- /* In case GIF=0 we can't rely on the CPU to tell us when
- * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
- * The next time we get that intercept, this function will be
- * called again though and we'll get the vintr intercept. */
- if (gif_set(svm)) {
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept.
+ */
+ if (gif_set(svm) && nested_svm_intr(svm)) {
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
@@ -2583,12 +2837,15 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
== HF_NMI_MASK)
return; /* IRET will cause a vm exit */
- /* Something prevents NMI from been injected. Single step over
- possible problem (IRET or exception injection or interrupt
- shadow) */
- svm->nmi_singlestep = true;
- svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(vcpu);
+ /*
+ * Something prevents NMI from been injected. Single step over possible
+ * problem (IRET or exception injection or interrupt shadow)
+ */
+ if (gif_set(svm) && nested_svm_nmi(svm)) {
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+ update_db_intercept(vcpu);
+ }
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2609,6 +2866,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
@@ -2620,6 +2880,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
+ if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ return;
+
cr8 = kvm_get_cr8(vcpu);
svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2630,6 +2893,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
u8 vector;
int type;
u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ unsigned int3_injected = svm->int3_injected;
+
+ svm->int3_injected = 0;
if (svm->vcpu.arch.hflags & HF_IRET_MASK)
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2649,12 +2915,21 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
svm->vcpu.arch.nmi_injected = true;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
- /* In case of software exception do not reinject an exception
- vector, but re-execute and instruction instead */
if (is_nested(svm))
break;
- if (kvm_exception_is_soft(vector))
+ /*
+ * In case of software exceptions, do not reinject the vector,
+ * but re-execute the instruction instead. Rewind RIP first
+ * if we emulated INT3 before.
+ */
+ if (kvm_exception_is_soft(vector)) {
+ if (vector == BP_VECTOR && int3_injected &&
+ kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+ kvm_rip_write(&svm->vcpu,
+ kvm_rip_read(&svm->vcpu) -
+ int3_injected);
break;
+ }
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm->vmcb->control.exit_int_info_err;
kvm_queue_exception_e(&svm->vcpu, vector, err);
@@ -2875,24 +3150,24 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
}
static const struct trace_print_flags svm_exit_reasons_str[] = {
- { SVM_EXIT_READ_CR0, "read_cr0" },
- { SVM_EXIT_READ_CR3, "read_cr3" },
- { SVM_EXIT_READ_CR4, "read_cr4" },
- { SVM_EXIT_READ_CR8, "read_cr8" },
- { SVM_EXIT_WRITE_CR0, "write_cr0" },
- { SVM_EXIT_WRITE_CR3, "write_cr3" },
- { SVM_EXIT_WRITE_CR4, "write_cr4" },
- { SVM_EXIT_WRITE_CR8, "write_cr8" },
- { SVM_EXIT_READ_DR0, "read_dr0" },
- { SVM_EXIT_READ_DR1, "read_dr1" },
- { SVM_EXIT_READ_DR2, "read_dr2" },
- { SVM_EXIT_READ_DR3, "read_dr3" },
- { SVM_EXIT_WRITE_DR0, "write_dr0" },
- { SVM_EXIT_WRITE_DR1, "write_dr1" },
- { SVM_EXIT_WRITE_DR2, "write_dr2" },
- { SVM_EXIT_WRITE_DR3, "write_dr3" },
- { SVM_EXIT_WRITE_DR5, "write_dr5" },
- { SVM_EXIT_WRITE_DR7, "write_dr7" },
+ { SVM_EXIT_READ_CR0, "read_cr0" },
+ { SVM_EXIT_READ_CR3, "read_cr3" },
+ { SVM_EXIT_READ_CR4, "read_cr4" },
+ { SVM_EXIT_READ_CR8, "read_cr8" },
+ { SVM_EXIT_WRITE_CR0, "write_cr0" },
+ { SVM_EXIT_WRITE_CR3, "write_cr3" },
+ { SVM_EXIT_WRITE_CR4, "write_cr4" },
+ { SVM_EXIT_WRITE_CR8, "write_cr8" },
+ { SVM_EXIT_READ_DR0, "read_dr0" },
+ { SVM_EXIT_READ_DR1, "read_dr1" },
+ { SVM_EXIT_READ_DR2, "read_dr2" },
+ { SVM_EXIT_READ_DR3, "read_dr3" },
+ { SVM_EXIT_WRITE_DR0, "write_dr0" },
+ { SVM_EXIT_WRITE_DR1, "write_dr1" },
+ { SVM_EXIT_WRITE_DR2, "write_dr2" },
+ { SVM_EXIT_WRITE_DR3, "write_dr3" },
+ { SVM_EXIT_WRITE_DR5, "write_dr5" },
+ { SVM_EXIT_WRITE_DR7, "write_dr7" },
{ SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
{ SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
{ SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -2941,8 +3216,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- update_cr0_intercept(svm);
svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
+ if (is_nested(svm))
+ svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+ update_cr0_intercept(svm);
}
static struct kvm_x86_ops svm_x86_ops = {
@@ -2981,8 +3258,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .get_dr = svm_get_dr,
- .set_dr = svm_set_dr,
+ .set_dr7 = svm_set_dr7,
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea4043..4ddadb1 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
/*
* There is a race window between reading and incrementing, but we do
* not care about potentially loosing timer events in the !reinject
- * case anyway.
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
*/
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a2..a6544b8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#define TRACE_INCLUDE_FILE trace
/*
* Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
- TP_ARGS(exit_reason, guest_rip),
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
+ TP_ARGS(exit_reason, vcpu),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
TP_fast_assign(
__entry->exit_reason = exit_reason;
- __entry->guest_rip = guest_rip;
+ __entry->guest_rip = kvm_rip_read(vcpu);
),
TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
TP_printk("irq %u", __entry->irq)
);
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
+ EXS(MF), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+ TP_ARGS(exception, has_error, error_code),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("%s (0x%x)",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ /* FIXME: don't print error_code if not present */
+ __entry->has_error ? __entry->error_code : 0)
+);
+
/*
* Tracepoint for page fault.
*/
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
),
TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
- "event_inj: 0x%08x npt: %s\n",
+ "event_inj: 0x%08x npt: %s",
__entry->rip, __entry->vmcb, __entry->nested_rip,
__entry->int_ctl, __entry->event_inj,
__entry->npt ? "on" : "off")
);
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u64, intercept )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept = intercept;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept)
+);
/*
* Tracepoint for #VMEXIT while nested
*/
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
__entry->exit_int_info_err = exit_int_info_err;
),
TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
__entry->rip,
ftrace_print_symbols_seq(p, __entry->exit_code,
kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
),
TP_printk("reason: %s ext_inf1: 0x%016llx "
- "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
ftrace_print_symbols_seq(p, __entry->exit_code,
kvm_x86_ops->exit_reasons_str),
__entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
__entry->rip = rip
),
- TP_printk("rip: 0x%016llx\n", __entry->rip)
+ TP_printk("rip: 0x%016llx", __entry->rip)
);
/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
__entry->address = address;
),
- TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
__entry->rip, __entry->asid, __entry->address)
);
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
__entry->slb = slb;
),
- TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
__entry->rip, __entry->slb)
);
+#define __print_insn(insn, ilen) ({ \
+ int i; \
+ const char *ret = p->buffer + p->len; \
+ \
+ for (i = 0; i < ilen; ++i) \
+ trace_seq_printf(p, " %02x", insn[i]); \
+ trace_seq_printf(p, "%c", 0); \
+ ret; \
+ })
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, 15 )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+ __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt.decode.eip
+ - vcpu->arch.emulate_ctxt.decode.fetch.start;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt.decode.fetch.data,
+ 15);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_insn(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
#endif /* _TRACE_KVM_H */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
/* This part must be outside protection */
#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 82be6da..0b896ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -77,6 +77,8 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -131,7 +133,7 @@ struct vcpu_vmx {
} host_state;
struct {
int vm86_active;
- u8 save_iopl;
+ ulong save_rflags;
struct kvm_save_segment {
u16 selector;
unsigned long base;
@@ -232,56 +234,56 @@ static const u32 vmx_msr_index[] = {
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-static inline int is_page_fault(u32 intr_info)
+static inline bool is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int is_no_device(u32 intr_info)
+static inline bool is_no_device(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int is_invalid_opcode(u32 intr_info)
+static inline bool is_invalid_opcode(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int is_external_interrupt(u32 intr_info)
+static inline bool is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
-static inline int is_machine_check(u32 intr_info)
+static inline bool is_machine_check(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}
-static inline int cpu_has_vmx_msr_bitmap(void)
+static inline bool cpu_has_vmx_msr_bitmap(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
-static inline int cpu_has_vmx_tpr_shadow(void)
+static inline bool cpu_has_vmx_tpr_shadow(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}
-static inline int cpu_has_secondary_exec_ctrls(void)
+static inline bool cpu_has_secondary_exec_ctrls(void)
{
return vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -301,80 +303,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
static inline bool cpu_has_vmx_ept_execute_only(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
- return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+ return vmx_capability.ept & VMX_EPTP_UC_BIT;
}
static inline bool cpu_has_vmx_eptp_writeback(void)
{
- return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
}
static inline bool cpu_has_vmx_ept_2m_page(void)
{
- return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}
static inline bool cpu_has_vmx_ept_1g_page(void)
{
- return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
-static inline int cpu_has_vmx_invept_individual_addr(void)
+static inline bool cpu_has_vmx_invept_individual_addr(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
+ return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
}
-static inline int cpu_has_vmx_invept_context(void)
+static inline bool cpu_has_vmx_invept_context(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}
-static inline int cpu_has_vmx_invept_global(void)
+static inline bool cpu_has_vmx_invept_global(void)
{
- return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
-static inline int cpu_has_vmx_ept(void)
+static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_EPT;
}
-static inline int cpu_has_vmx_unrestricted_guest(void)
+static inline bool cpu_has_vmx_unrestricted_guest(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
-static inline int cpu_has_vmx_ple(void)
+static inline bool cpu_has_vmx_ple(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled && irqchip_in_kernel(kvm);
}
-static inline int cpu_has_vmx_vpid(void)
+static inline bool cpu_has_vmx_vpid(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID;
}
-static inline int cpu_has_vmx_rdtscp(void)
+static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDTSCP;
}
-static inline int cpu_has_virtual_nmis(void)
+static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
@@ -598,11 +600,11 @@ static void reload_tss(void)
/*
* VT restores TR but not its size. Useless.
*/
- struct descriptor_table gdt;
+ struct desc_ptr gdt;
struct desc_struct *descs;
- kvm_get_gdt(&gdt);
- descs = (void *)gdt.base;
+ native_store_gdt(&gdt);
+ descs = (void *)gdt.address;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
}
@@ -632,6 +634,43 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
return true;
}
+static unsigned long segment_base(u16 selector)
+{
+ struct desc_ptr gdt;
+ struct desc_struct *d;
+ unsigned long table_base;
+ unsigned long v;
+
+ if (!(selector & ~3))
+ return 0;
+
+ native_store_gdt(&gdt);
+ table_base = gdt.address;
+
+ if (selector & 4) { /* from ldt */
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~3))
+ return 0;
+
+ table_base = segment_base(ldt_selector);
+ }
+ d = (struct desc_struct *)(table_base + (selector & ~7));
+ v = get_desc_base(d);
+#ifdef CONFIG_X86_64
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+#endif
+ return v;
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+ u16 tr;
+ asm("str %0" : "=g"(tr));
+ return segment_base(tr);
+}
+
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -756,7 +795,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (vcpu->cpu != cpu) {
- struct descriptor_table dt;
+ struct desc_ptr dt;
unsigned long sysenter_esp;
vcpu->cpu = cpu;
@@ -765,8 +804,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors.
*/
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- kvm_get_gdt(&dt);
- vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
+ native_store_gdt(&dt);
+ vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -818,18 +857,23 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
- unsigned long rflags;
+ unsigned long rflags, save_rflags;
rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active)
- rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
return rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (to_vmx(vcpu)->rmode.vm86_active)
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
vmcs_writel(GUEST_RFLAGS, rflags);
}
@@ -839,9 +883,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
- ret |= X86_SHADOW_INT_STI;
+ ret |= KVM_X86_SHADOW_INT_STI;
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
- ret |= X86_SHADOW_INT_MOV_SS;
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
@@ -853,9 +897,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
- if (mask & X86_SHADOW_INT_MOV_SS)
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
interruptibility |= GUEST_INTR_STATE_MOV_SS;
- if (mask & X86_SHADOW_INT_STI)
+ else if (mask & KVM_X86_SHADOW_INT_STI)
interruptibility |= GUEST_INTR_STATE_STI;
if ((interruptibility != interruptibility_old))
@@ -1483,8 +1527,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
- flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
- flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
+ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1557,8 +1601,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
- vmx->rmode.save_iopl
- = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+ vmx->rmode.save_rflags = flags;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1928,28 +1971,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_IDTR_BASE);
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_IDTR_BASE, dt->base);
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
- dt->base = vmcs_readl(GUEST_GDTR_BASE);
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
- vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
- vmcs_writel(GUEST_GDTR_BASE, dt->base);
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
}
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2328,7 +2371,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
u32 junk;
u64 host_pat, tsc_this, tsc_base;
unsigned long a;
- struct descriptor_table dt;
+ struct desc_ptr dt;
int i;
unsigned long kvm_vmx_return;
u32 exec_control;
@@ -2409,8 +2452,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
- kvm_get_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
+ native_store_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
@@ -2942,22 +2985,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
int size, in, string;
unsigned port;
- ++vcpu->stat.io_exits;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
string = (exit_qualification & 16) != 0;
+ in = (exit_qualification & 8) != 0;
- if (string) {
- if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
- return 0;
- return 1;
- }
+ ++vcpu->stat.io_exits;
- size = (exit_qualification & 7) + 1;
- in = (exit_qualification & 8) != 0;
- port = exit_qualification >> 16;
+ if (string || in)
+ return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
skip_emulated_instruction(vcpu);
- return kvm_emulate_pio(vcpu, in, size, port);
+
+ return kvm_fast_pio_out(vcpu, size, port);
}
static void
@@ -3048,19 +3089,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 0;
}
-static int check_dr_alias(struct kvm_vcpu *vcpu)
-{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return -1;
- }
- return 0;
-}
-
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- unsigned long val;
int dr, reg;
/* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3095,67 +3126,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- switch (dr) {
- case 0 ... 3:
- val = vcpu->arch.db[dr];
- break;
- case 4:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- case 6:
- val = vcpu->arch.dr6;
- break;
- case 5:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- default: /* 7 */
- val = vcpu->arch.dr7;
- break;
- }
- kvm_register_write(vcpu, reg, val);
- } else {
- val = vcpu->arch.regs[reg];
- switch (dr) {
- case 0 ... 3:
- vcpu->arch.db[dr] = val;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- vcpu->arch.eff_db[dr] = val;
- break;
- case 4:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- case 6:
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
- break;
- case 5:
- if (check_dr_alias(vcpu) < 0)
- return 1;
- /* fall through */
- default: /* 7 */
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs =
- (val & DR7_BP_EN_MASK);
- }
- break;
- }
- }
+ unsigned long val;
+ if (!kvm_get_dr(vcpu, dr, &val))
+ kvm_register_write(vcpu, reg, val);
+ } else
+ kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
}
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
static int handle_cpuid(struct kvm_vcpu *vcpu)
{
kvm_emulate_cpuid(vcpu);
@@ -3287,6 +3271,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
u16 tss_selector;
int reason, type, idt_v;
@@ -3309,6 +3295,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
kvm_clear_interrupt_queue(vcpu);
break;
case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ /* fall through */
case INTR_TYPE_SOFT_EXCEPTION:
kvm_clear_exception_queue(vcpu);
break;
@@ -3323,8 +3316,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
type != INTR_TYPE_NMI_INTR))
skip_emulated_instruction(vcpu);
- if (!kvm_task_switch(vcpu, tss_selector, reason))
+ if (kvm_task_switch(vcpu, tss_selector, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
+ }
/* clear all local breakpoint enable flags */
vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3569,7 +3567,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+ trace_kvm_exit(exit_reason, vcpu);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -4149,6 +4147,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c3a33b2..58a96e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
+
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
kvm_on_user_return(&smsr->urn);
}
-unsigned long segment_base(u16 selector)
-{
- struct descriptor_table gdt;
- struct desc_struct *d;
- unsigned long table_base;
- unsigned long v;
-
- if (selector == 0)
- return 0;
-
- kvm_get_gdt(&gdt);
- table_base = gdt.base;
-
- if (selector & 4) { /* from ldt */
- u16 ldt_selector = kvm_read_ldt();
-
- table_base = segment_base(ldt_selector);
- }
- d = (struct desc_struct *)(table_base + (selector & ~7));
- v = get_desc_base(d);
-#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
- return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
@@ -434,8 +406,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
#ifdef CONFIG_X86_64
if (cr0 & 0xffffffff00000000UL) {
- printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
- cr0, kvm_read_cr0(vcpu));
kvm_inject_gp(vcpu, 0);
return;
}
@@ -444,14 +414,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
cr0 &= ~CR0_RESERVED_BITS;
if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
kvm_inject_gp(vcpu, 0);
return;
}
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
- "and a clear PE flag\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -462,15 +429,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
int cs_db, cs_l;
if (!is_pae(vcpu)) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while PAE is disabled\n");
kvm_inject_gp(vcpu, 0);
return;
}
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
if (cs_l) {
- printk(KERN_DEBUG "set_cr0: #GP, start paging "
- "in long mode while CS.L == 1\n");
kvm_inject_gp(vcpu, 0);
return;
@@ -478,8 +441,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
- "reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -487,7 +448,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
kvm_x86_ops->set_cr0(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
kvm_mmu_reset_context(vcpu);
return;
@@ -506,28 +466,23 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
if (cr4 & CR4_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE)) {
- printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
- "in long mode\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (cr4 & X86_CR4_VMXE) {
- printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -548,21 +503,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (is_long_mode(vcpu)) {
if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
} else {
if (is_pae(vcpu)) {
if (cr3 & CR3_PAE_RESERVED_BITS) {
- printk(KERN_DEBUG
- "set_cr3: #GP, reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
- "reserved bits\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -594,7 +544,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS) {
- printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
kvm_inject_gp(vcpu, 0);
return;
}
@@ -614,6 +563,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+ vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+ }
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ switch (dr) {
+ case 0 ... 3:
+ *val = vcpu->arch.db[dr];
+ break;
+ case 4:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ case 6:
+ *val = vcpu->arch.dr6;
+ break;
+ case 5:
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ /* fall through */
+ default: /* 7 */
+ *val = vcpu->arch.dr7;
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
static inline u32 bit(int bitno)
{
return 1 << (bitno & 31);
@@ -650,15 +673,12 @@ static u32 emulated_msrs[] = {
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits) {
- printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
- efer);
kvm_inject_gp(vcpu, 0);
return;
}
if (is_paging(vcpu)
&& (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
- printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -668,7 +688,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
- printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -679,7 +698,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
- printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
kvm_inject_gp(vcpu, 0);
return;
}
@@ -968,9 +986,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
u32 offset = msr - MSR_IA32_MC0_CTL;
- /* only 0 or all 1s can be written to IA32_MCi_CTL */
+ /* only 0 or all 1s can be written to IA32_MCi_CTL
+ * some Linux kernels though clear bit 10 in bank 4 to
+ * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+ * this to avoid an uncatched #GP in the guest
+ */
if ((offset & 0x3) == 0 &&
- data != 0 && data != ~(u64)0)
+ data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
vcpu->arch.mce_banks[offset] = data;
break;
@@ -1114,6 +1136,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -1572,6 +1595,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
r = 1;
break;
@@ -2124,14 +2148,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
{
vcpu_load(vcpu);
- events->exception.injected = vcpu->arch.exception.pending;
+ events->exception.injected =
+ vcpu->arch.exception.pending &&
+ !kvm_exception_is_soft(vcpu->arch.exception.nr);
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
events->exception.error_code = vcpu->arch.exception.error_code;
- events->interrupt.injected = vcpu->arch.interrupt.pending;
+ events->interrupt.injected =
+ vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = vcpu->arch.interrupt.soft;
+ events->interrupt.soft = 0;
+ events->interrupt.shadow =
+ kvm_x86_ops->get_interrupt_shadow(vcpu,
+ KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2140,7 +2170,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->sipi_vector = vcpu->arch.sipi_vector;
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW);
vcpu_put(vcpu);
}
@@ -2149,7 +2180,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
vcpu_load(vcpu);
@@ -2164,6 +2196,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
kvm_pic_clear_isr_ack(vcpu->kvm);
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ kvm_x86_ops->set_interrupt_shadow(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2178,6 +2213,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ vcpu_load(vcpu);
+
+ memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+ dbgregs->dr6 = vcpu->arch.dr6;
+ dbgregs->dr7 = vcpu->arch.dr7;
+ dbgregs->flags = 0;
+
+ vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+
+ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ vcpu->arch.dr6 = dbgregs->dr6;
+ vcpu->arch.dr7 = dbgregs->dr7;
+
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2356,6 +2421,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
break;
}
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -2636,8 +2724,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log)
{
- int r, n, i;
+ int r, i;
struct kvm_memory_slot *memslot;
+ unsigned long n;
unsigned long is_dirty = 0;
unsigned long *dirty_bitmap = NULL;
@@ -2652,7 +2741,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
if (!memslot->dirty_bitmap)
goto out;
- n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+ n = kvm_dirty_bitmap_bytes(memslot);
r = -ENOMEM;
dirty_bitmap = vmalloc(n);
@@ -2822,11 +2911,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
+ r = -ENXIO;
if (irqchip_in_kernel(kvm)) {
__s32 status;
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
if (ioctl == KVM_IRQ_LINE_STATUS) {
+ r = -EFAULT;
irq_event.status = status;
if (copy_to_user(argp, &irq_event,
sizeof irq_event))
@@ -3042,6 +3133,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
}
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3122,14 +3225,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
}
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu,
+ u32 *error)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
+ PFERR_WRITE_MASK, error);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -3152,7 +3258,6 @@ out:
return r;
}
-
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
@@ -3255,9 +3360,9 @@ mmio:
}
int emulator_write_emulated(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+ const void *val,
+ unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3275,45 +3380,150 @@ int emulator_write_emulated(unsigned long addr,
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+# define CMPXCHG64(ptr, old, new) \
+ (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
struct kvm_vcpu *vcpu)
{
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
- /* guests cmpxchg8b have to be emulated atomically */
- if (bytes == 8) {
- gpa_t gpa;
- struct page *page;
- char *kaddr;
- u64 val;
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ bool exchanged;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
- if (gpa == UNMAPPED_GVA ||
- (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- goto emul_write;
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
- goto emul_write;
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
- val = *(u64 *)new;
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- kaddr = kmap_atomic(page, KM_USER0);
- set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
- kunmap_atomic(kaddr, KM_USER0);
- kvm_release_page_dirty(page);
+ kaddr = kmap_atomic(page, KM_USER0);
+ kaddr += offset_in_page(gpa);
+ switch (bytes) {
+ case 1:
+ exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ break;
+ case 2:
+ exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ break;
+ case 4:
+ exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ break;
+ case 8:
+ exchanged = CMPXCHG64(kaddr, old, new);
+ break;
+ default:
+ BUG();
}
+ kunmap_atomic(kaddr, KM_USER0);
+ kvm_release_page_dirty(page);
+
+ if (!exchanged)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+ return X86EMUL_CONTINUE;
+
emul_write:
-#endif
+ printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
return emulator_write_emulated(addr, new, bytes, vcpu);
}
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ /* TODO: String I/O for in kernel device */
+ int r;
+
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ return r;
+}
+
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+ unsigned int count, struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.pio.count)
+ goto data_avail;
+
+ trace_kvm_pio(1, port, size, 1);
+
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = 1;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ data_avail:
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = KVM_EXIT_IO_IN;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
+static int emulator_pio_out_emulated(int size, unsigned short port,
+ const void *val, unsigned int count,
+ struct kvm_vcpu *vcpu)
+{
+ trace_kvm_pio(0, port, size, 1);
+
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = 0;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ memcpy(vcpu->arch.pio_data, val, size * count);
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3334,14 +3544,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
+ return kvm_get_dr(ctxt->vcpu, dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
- return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
+ return kvm_set_dr(ctxt->vcpu, dr, value & mask);
}
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3362,12 +3572,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = vcpu->arch.cr3;
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+ switch (cr) {
+ case 0:
+ kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ kvm_set_cr8(vcpu, val & 0xfUL);
+ break;
+ default:
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ }
+}
+
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->get_cpl(vcpu);
+}
+
+static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(vcpu, &var, seg);
+
+ if (var.unusable)
+ return false;
+
+ if (var.g)
+ var.limit >>= 12;
+ set_desc_limit(desc, var.limit);
+ set_desc_base(desc, (unsigned long)var.base);
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment var;
+
+ /* needed to preserve selector */
+ kvm_get_segment(vcpu, &var, seg);
+
+ var.base = get_desc_base(desc);
+ var.limit = get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.present = desc->p;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+ return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ kvm_get_segment(vcpu, &kvm_seg, seg);
+ kvm_seg.selector = sel;
+ kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
+ .write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_cached_descriptor = emulator_get_cached_descriptor,
+ .set_cached_descriptor = emulator_set_cached_descriptor,
+ .get_segment_selector = emulator_get_segment_selector,
+ .set_segment_selector = emulator_set_segment_selector,
+ .get_gdt = emulator_get_gdt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .set_rflags = emulator_set_rflags,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3398,14 +3763,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
cache_all_regs(vcpu);
vcpu->mmio_is_write = 0;
- vcpu->arch.pio.string = 0;
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
int cs_db, cs_l;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
vcpu->arch.emulate_ctxt.mode =
(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3414,6 +3779,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+ trace_kvm_emulate_insn_start(vcpu);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3446,6 +3812,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
return EMULATE_DONE;
return EMULATE_FAIL;
@@ -3457,16 +3824,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
+restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
if (r == 0)
kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
- if (vcpu->arch.pio.string)
+ if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in)
+ vcpu->arch.pio.count = 0;
return EMULATE_DO_MMIO;
+ }
- if ((r || vcpu->mmio_is_write) && run) {
+ if (r || vcpu->mmio_is_write) {
run->exit_reason = KVM_EXIT_MMIO;
run->mmio.phys_addr = vcpu->mmio_phys_addr;
memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3476,222 +3847,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
if (r) {
if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- return EMULATE_DONE;
+ goto done;
if (!vcpu->mmio_needed) {
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
kvm_report_emulation_failure(vcpu, "mmio");
return EMULATE_FAIL;
}
return EMULATE_DO_MMIO;
}
- kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
if (vcpu->mmio_is_write) {
vcpu->mmio_needed = 0;
return EMULATE_DO_MMIO;
}
- return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
- void *p = vcpu->arch.pio_data;
- gva_t q = vcpu->arch.pio.guest_gva;
- unsigned bytes;
- int ret;
- u32 error_code;
-
- bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
- if (vcpu->arch.pio.in)
- ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
- else
- ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
-
- if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(vcpu, q, error_code);
-
- return ret;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->arch.pio;
- long delta;
- int r;
- unsigned long val;
-
- if (!io->string) {
- if (io->in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(&val, vcpu->arch.pio_data, io->size);
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- }
- } else {
- if (io->in) {
- r = pio_copy_data(vcpu);
- if (r)
- goto out;
- }
-
- delta = 1;
- if (io->rep) {
- delta *= io->cur_count;
- /*
- * The size of the register should really depend on
- * current address size.
- */
- val = kvm_register_read(vcpu, VCPU_REGS_RCX);
- val -= delta;
- kvm_register_write(vcpu, VCPU_REGS_RCX, val);
- }
- if (io->down)
- delta = -delta;
- delta *= io->size;
- if (io->in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RDI);
- val += delta;
- kvm_register_write(vcpu, VCPU_REGS_RDI, val);
- } else {
- val = kvm_register_read(vcpu, VCPU_REGS_RSI);
- val += delta;
- kvm_register_write(vcpu, VCPU_REGS_RSI, val);
- }
- }
-out:
- io->count -= io->cur_count;
- io->cur_count = 0;
-
- return 0;
-}
+done:
+ if (vcpu->arch.exception.pending)
+ vcpu->arch.emulate_ctxt.restart = false;
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
- /* TODO: String I/O for in kernel device */
- int r;
+ if (vcpu->arch.emulate_ctxt.restart)
+ goto restart;
- if (vcpu->arch.pio.in)
- r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
- vcpu->arch.pio.size, pd);
- else
- r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- vcpu->arch.pio.port, vcpu->arch.pio.size,
- pd);
- return r;
-}
-
-static int pio_string_write(struct kvm_vcpu *vcpu)
-{
- struct kvm_pio_request *io = &vcpu->arch.pio;
- void *pd = vcpu->arch.pio_data;
- int i, r = 0;
-
- for (i = 0; i < io->cur_count; i++) {
- if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
- io->port, io->size, pd)) {
- r = -EOPNOTSUPP;
- break;
- }
- pd += io->size;
- }
- return r;
-}
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
-{
- unsigned long val;
-
- trace_kvm_pio(!in, port, size, 1);
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->arch.pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
- vcpu->run->io.port = vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.string = 0;
- vcpu->arch.pio.down = 0;
- vcpu->arch.pio.rep = 0;
-
- if (!vcpu->arch.pio.in) {
- val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- memcpy(vcpu->arch.pio_data, &val, 4);
- }
-
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- complete_pio(vcpu);
- return 1;
- }
- return 0;
+ return EMULATE_DONE;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+EXPORT_SYMBOL_GPL(emulate_instruction);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
- int size, unsigned long count, int down,
- gva_t address, int rep, unsigned port)
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
- unsigned now, in_page;
- int ret = 0;
-
- trace_kvm_pio(!in, port, size, count);
-
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
- vcpu->run->io.size = vcpu->arch.pio.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
- vcpu->run->io.port = vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = in;
- vcpu->arch.pio.string = 1;
- vcpu->arch.pio.down = down;
- vcpu->arch.pio.rep = rep;
-
- if (!count) {
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- return 1;
- }
-
- if (!down)
- in_page = PAGE_SIZE - offset_in_page(address);
- else
- in_page = offset_in_page(address) + size;
- now = min(count, (unsigned long)in_page / size);
- if (!now)
- now = 1;
- if (down) {
- /*
- * String I/O in reverse. Yuck. Kill the guest, fix later.
- */
- pr_unimpl(vcpu, "guest string pio down\n");
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- vcpu->run->io.count = now;
- vcpu->arch.pio.cur_count = now;
-
- if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
- vcpu->arch.pio.guest_gva = address;
-
- if (!vcpu->arch.pio.in) {
- /* string PIO write */
- ret = pio_copy_data(vcpu);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- return 1;
- if (ret == 0 && !pio_string_write(vcpu)) {
- complete_pio(vcpu);
- if (vcpu->arch.pio.count == 0)
- ret = 1;
- }
- }
- /* no string PIO read support yet */
-
+ unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+ /* do not return to emulator after return from userspace */
+ vcpu->arch.pio.count = 0;
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static void bounce_off(void *info)
{
@@ -4010,85 +4200,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
return emulator_write_emulated(rip, instruction, 3, vcpu);
}
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
- return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct descriptor_table dt = { limit, base };
+ struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_gdt(vcpu, &dt);
}
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
- struct descriptor_table dt = { limit, base };
+ struct desc_ptr dt = { limit, base };
kvm_x86_ops->set_idt(vcpu, &dt);
}
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
- unsigned long *rflags)
-{
- kvm_lmsw(vcpu, msw);
- *rflags = kvm_get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
- unsigned long value;
-
- switch (cr) {
- case 0:
- value = kvm_read_cr0(vcpu);
- break;
- case 2:
- value = vcpu->arch.cr2;
- break;
- case 3:
- value = vcpu->arch.cr3;
- break;
- case 4:
- value = kvm_read_cr4(vcpu);
- break;
- case 8:
- value = kvm_get_cr8(vcpu);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- return 0;
- }
-
- return value;
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
- unsigned long *rflags)
-{
- switch (cr) {
- case 0:
- kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
- *rflags = kvm_get_rflags(vcpu);
- break;
- case 2:
- vcpu->arch.cr2 = val;
- break;
- case 3:
- kvm_set_cr3(vcpu, val);
- break;
- case 4:
- kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
- break;
- case 8:
- kvm_set_cr8(vcpu, val & 0xfUL);
- break;
- default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
- }
-}
-
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
{
struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4152,9 +4277,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best)
return best->eax & 0xff;
+not_found:
return 36;
}
@@ -4268,6 +4397,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
{
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
@@ -4528,24 +4660,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (!irqchip_in_kernel(vcpu->kvm))
kvm_set_cr8(vcpu, kvm_run->cr8);
- if (vcpu->arch.pio.cur_count) {
- r = complete_pio(vcpu);
- if (r)
- goto out;
- }
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
-
+ if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+ vcpu->arch.emulate_ctxt.restart) {
+ if (vcpu->mmio_needed) {
+ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+ vcpu->mmio_read_completed = 1;
+ vcpu->mmio_needed = 0;
+ }
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
- EMULTYPE_NO_DECODE);
+ r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r == EMULATE_DO_MMIO) {
- /*
- * Read-modify-write. Back to userspace.
- */
r = 0;
goto out;
}
@@ -4628,12 +4753,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
-void kvm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
struct kvm_segment cs;
@@ -4647,7 +4766,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
- struct descriptor_table dt;
+ struct desc_ptr dt;
vcpu_load(vcpu);
@@ -4662,11 +4781,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
kvm_x86_ops->get_idt(vcpu, &dt);
- sregs->idt.limit = dt.limit;
- sregs->idt.base = dt.base;
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
kvm_x86_ops->get_gdt(vcpu, &dt);
- sregs->gdt.limit = dt.limit;
- sregs->gdt.base = dt.base;
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
@@ -4705,559 +4824,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
return 0;
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
- struct kvm_segment *kvm_desct)
-{
- kvm_desct->base = get_desc_base(seg_desc);
- kvm_desct->limit = get_desc_limit(seg_desc);
- if (seg_desc->g) {
- kvm_desct->limit <<= 12;
- kvm_desct->limit |= 0xfff;
- }
- kvm_desct->selector = selector;
- kvm_desct->type = seg_desc->type;
- kvm_desct->present = seg_desc->p;
- kvm_desct->dpl = seg_desc->dpl;
- kvm_desct->db = seg_desc->d;
- kvm_desct->s = seg_desc->s;
- kvm_desct->l = seg_desc->l;
- kvm_desct->g = seg_desc->g;
- kvm_desct->avl = seg_desc->avl;
- if (!selector)
- kvm_desct->unusable = 1;
- else
- kvm_desct->unusable = 0;
- kvm_desct->padding = 0;
-}
-
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
- u16 selector,
- struct descriptor_table *dtable)
-{
- if (selector & 1 << 2) {
- struct kvm_segment kvm_seg;
-
- kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
- if (kvm_seg.unusable)
- dtable->limit = 0;
- else
- dtable->limit = kvm_seg.limit;
- dtable->base = kvm_seg.base;
- }
- else
- kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- struct desc_struct *seg_desc)
-{
- struct descriptor_table dtable;
- u16 index = selector >> 3;
- int ret;
- u32 err;
- gva_t addr;
-
- get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
- if (dtable.limit < index * 8 + 7) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
- addr = dtable.base + index * 8;
- ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
- vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(vcpu, addr, err);
-
- return ret;
-}
-
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
- struct desc_struct *seg_desc)
-{
- struct descriptor_table dtable;
- u16 index = selector >> 3;
-
- get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
- if (dtable.limit < index * 8 + 7)
- return 1;
- return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
-}
-
-static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc)
-{
- u32 base_addr = get_desc_base(seg_desc);
-
- return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
-}
-
-static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc)
-{
- u32 base_addr = get_desc_base(seg_desc);
-
- return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
-}
-
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_segment kvm_seg;
-
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
-}
-
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
- struct kvm_segment segvar = {
- .base = selector << 4,
- .limit = 0xffff,
- .selector = selector,
- .type = 3,
- .present = 1,
- .dpl = 3,
- .db = 0,
- .s = 1,
- .l = 0,
- .g = 0,
- .avl = 0,
- .unusable = 0,
- };
- kvm_x86_ops->set_segment(vcpu, &segvar, seg);
- return X86EMUL_CONTINUE;
-}
-
-static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+ bool has_error_code, u32 error_code)
{
- return (seg != VCPU_SREG_LDTR) &&
- (seg != VCPU_SREG_TR) &&
- (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
- struct kvm_segment kvm_seg;
- struct desc_struct seg_desc;
- u8 dpl, rpl, cpl;
- unsigned err_vec = GP_VECTOR;
- u32 err_code = 0;
- bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
- int ret;
+ int cs_db, cs_l, ret;
+ cache_all_regs(vcpu);
- if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
- return kvm_load_realmode_segment(vcpu, selector, seg);
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
- && null_selector)
- goto exception;
+ vcpu->arch.emulate_ctxt.vcpu = vcpu;
+ vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+ vcpu->arch.emulate_ctxt.mode =
+ (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+ ? X86EMUL_MODE_VM86 : cs_l
+ ? X86EMUL_MODE_PROT64 : cs_db
+ ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- /* TR should be in GDT only */
- if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
- goto exception;
+ ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+ tss_selector, reason, has_error_code,
+ error_code);
- ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
if (ret)
- return ret;
-
- seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
- if (null_selector) { /* for NULL selector skip all following checks */
- kvm_seg.unusable = 1;
- goto load;
- }
-
- err_code = selector & 0xfffc;
- err_vec = GP_VECTOR;
-
- /* can't load system descriptor into segment selecor */
- if (seg <= VCPU_SREG_GS && !kvm_seg.s)
- goto exception;
-
- if (!kvm_seg.present) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
- rpl = selector & 3;
- dpl = kvm_seg.dpl;
- cpl = kvm_x86_ops->get_cpl(vcpu);
-
- switch (seg) {
- case VCPU_SREG_SS:
- /*
- * segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
- */
- if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
- goto exception;
- break;
- case VCPU_SREG_CS:
- if (!(kvm_seg.type & 8))
- goto exception;
-
- if (kvm_seg.type & 4) {
- /* conforming */
- if (dpl > cpl)
- goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
- goto exception;
- }
- /* CS(RPL) <- CPL */
- selector = (selector & 0xfffc) | cpl;
- break;
- case VCPU_SREG_TR:
- if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
- goto exception;
- break;
- case VCPU_SREG_LDTR:
- if (kvm_seg.s || kvm_seg.type != 2)
- goto exception;
- break;
- default: /* DS, ES, FS, or GS */
- /*
- * segment is not a data or readable code segment or
- * ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
- */
- if ((kvm_seg.type & 0xa) == 0x8 ||
- (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
- goto exception;
- break;
- }
-
- if (!kvm_seg.unusable && kvm_seg.s) {
- /* mark segment as accessed */
- kvm_seg.type |= 1;
- seg_desc.type |= 1;
- save_guest_segment_descriptor(vcpu, selector, &seg_desc);
- }
-load:
- kvm_set_segment(vcpu, &kvm_seg, seg);
- return X86EMUL_CONTINUE;
-exception:
- kvm_queue_exception_e(vcpu, err_vec, err_code);
- return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
- struct tss_segment_32 *tss)
-{
- tss->cr3 = vcpu->arch.cr3;
- tss->eip = kvm_rip_read(vcpu);
- tss->eflags = kvm_get_rflags(vcpu);
- tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
- tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
- tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
- tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
- tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
- tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
- tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
- tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
-{
- struct kvm_segment kvm_seg;
- kvm_get_segment(vcpu, &kvm_seg, seg);
- kvm_seg.selector = sel;
- kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
- struct tss_segment_32 *tss)
-{
- kvm_set_cr3(vcpu, tss->cr3);
-
- kvm_rip_write(vcpu, tss->eip);
- kvm_set_rflags(vcpu, tss->eflags | 2);
-
- kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
- kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
- kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
- kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
- kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
- kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
- kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
- kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
- kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
- kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
-
- /*
- * Now load segment descriptors. If fault happenes at this stage
- * it is handled in a context of new task
- */
- if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
- return 1;
- return 0;
-}
-
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
- struct tss_segment_16 *tss)
-{
- tss->ip = kvm_rip_read(vcpu);
- tss->flag = kvm_get_rflags(vcpu);
- tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
- tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
- tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
- tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
- tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
- tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
- tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
- tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
- struct tss_segment_16 *tss)
-{
- kvm_rip_write(vcpu, tss->ip);
- kvm_set_rflags(vcpu, tss->flag | 2);
- kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
- kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
- kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
- kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
- kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
- kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
- /*
- * SDM says that segment selectors are loaded before segment
- * descriptors
- */
- kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
- kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
- kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
- kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
- kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-
- /*
- * Now load segment descriptors. If fault happenes at this stage
- * it is handled in a context of new task
- */
- if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
- return 1;
-
- if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
- return 1;
- return 0;
-}
-
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- u16 old_tss_sel, u32 old_tss_base,
- struct desc_struct *nseg_desc)
-{
- struct tss_segment_16 tss_segment_16;
- int ret = 0;
-
- if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
- sizeof tss_segment_16))
- goto out;
-
- save_state_to_tss16(vcpu, &tss_segment_16);
-
- if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
- sizeof tss_segment_16))
- goto out;
-
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
- &tss_segment_16, sizeof tss_segment_16))
- goto out;
-
- if (old_tss_sel != 0xffff) {
- tss_segment_16.prev_task_link = old_tss_sel;
-
- if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr_write(vcpu, nseg_desc),
- &tss_segment_16.prev_task_link,
- sizeof tss_segment_16.prev_task_link))
- goto out;
- }
-
- if (load_state_from_tss16(vcpu, &tss_segment_16))
- goto out;
-
- ret = 1;
-out:
- return ret;
-}
-
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- u16 old_tss_sel, u32 old_tss_base,
- struct desc_struct *nseg_desc)
-{
- struct tss_segment_32 tss_segment_32;
- int ret = 0;
-
- if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
- sizeof tss_segment_32))
- goto out;
-
- save_state_to_tss32(vcpu, &tss_segment_32);
-
- if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
- sizeof tss_segment_32))
- goto out;
-
- if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
- &tss_segment_32, sizeof tss_segment_32))
- goto out;
-
- if (old_tss_sel != 0xffff) {
- tss_segment_32.prev_task_link = old_tss_sel;
-
- if (kvm_write_guest(vcpu->kvm,
- get_tss_base_addr_write(vcpu, nseg_desc),
- &tss_segment_32.prev_task_link,
- sizeof tss_segment_32.prev_task_link))
- goto out;
- }
-
- if (load_state_from_tss32(vcpu, &tss_segment_32))
- goto out;
-
- ret = 1;
-out:
- return ret;
-}
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
-{
- struct kvm_segment tr_seg;
- struct desc_struct cseg_desc;
- struct desc_struct nseg_desc;
- int ret = 0;
- u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
- u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
-
- old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
-
- /* FIXME: Handle errors. Failure to read either TSS or their
- * descriptors should generate a pagefault.
- */
- if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
- goto out;
-
- if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
- goto out;
-
- if (reason != TASK_SWITCH_IRET) {
- int cpl;
-
- cpl = kvm_x86_ops->get_cpl(vcpu);
- if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return 1;
- }
- }
+ return EMULATE_FAIL;
- if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
- kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
- return 1;
- }
-
- if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
- cseg_desc.type &= ~(1 << 1); //clear the B flag
- save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
- }
-
- if (reason == TASK_SWITCH_IRET) {
- u32 eflags = kvm_get_rflags(vcpu);
- kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
- }
-
- /* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
- if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
- old_tss_sel = 0xffff;
-
- if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
- old_tss_base, &nseg_desc);
- else
- ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
- old_tss_base, &nseg_desc);
-
- if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
- u32 eflags = kvm_get_rflags(vcpu);
- kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
- }
-
- if (reason != TASK_SWITCH_IRET) {
- nseg_desc.type |= (1 << 1);
- save_guest_segment_descriptor(vcpu, tss_selector,
- &nseg_desc);
- }
-
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
- seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
- tr_seg.type = 11;
- kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
- return ret;
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5266,15 +4859,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int mmu_reset_needed = 0;
int pending_vec, max_bits;
- struct descriptor_table dt;
+ struct desc_ptr dt;
vcpu_load(vcpu);
- dt.limit = sregs->idt.limit;
- dt.base = sregs->idt.base;
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
- dt.limit = sregs->gdt.limit;
- dt.base = sregs->gdt.base;
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
@@ -5373,11 +4966,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- vcpu->arch.singlestep_cs =
- get_segment_selector(vcpu, VCPU_SREG_CS);
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
- }
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
/*
* Trigger an rflags update that will inject or remove the trace
@@ -5868,13 +5459,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ unsigned long current_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ return current_rip == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
rflags = kvm_x86_ops->get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+ rflags &= ~X86_EFLAGS_TF;
return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5882,10 +5482,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- vcpu->arch.singlestep_cs ==
- get_segment_selector(vcpu, VCPU_SREG_CS) &&
- vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
- rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
kvm_x86_ops->set_rflags(vcpu, rflags);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5901,3 +5499,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7e59dc1..2bdf628 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
local_irq_save(flags);
if (lguest_data.hcall_status[next_call] != 0xFF) {
/* Table full, so do normal hcall which will flush table. */
- kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+ hcall(call, arg1, arg2, arg3, arg4);
} else {
lguest_data.hcalls[next_call].arg0 = call;
lguest_data.hcalls[next_call].arg1 = arg1;
@@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1,
* So, when we're in lazy mode, we call async_hcall() to store the call for
* future processing:
*/
-static void lazy_hcall1(unsigned long call,
- unsigned long arg1)
+static void lazy_hcall1(unsigned long call, unsigned long arg1)
{
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall1(call, arg1);
+ hcall(call, arg1, 0, 0, 0);
else
async_hcall(call, arg1, 0, 0, 0);
}
/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
static void lazy_hcall2(unsigned long call,
- unsigned long arg1,
- unsigned long arg2)
+ unsigned long arg1,
+ unsigned long arg2)
{
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall2(call, arg1, arg2);
+ hcall(call, arg1, arg2, 0, 0);
else
async_hcall(call, arg1, arg2, 0, 0);
}
static void lazy_hcall3(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3)
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3)
{
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall3(call, arg1, arg2, arg3);
+ hcall(call, arg1, arg2, arg3, 0);
else
async_hcall(call, arg1, arg2, arg3, 0);
}
#ifdef CONFIG_X86_PAE
static void lazy_hcall4(unsigned long call,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4)
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4)
{
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
- kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+ hcall(call, arg1, arg2, arg3, arg4);
else
async_hcall(call, arg1, arg2, arg3, arg4);
}
@@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call,
:*/
static void lguest_leave_lazy_mmu_mode(void)
{
- kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_leave_lazy_mmu();
}
static void lguest_end_context_switch(struct task_struct *next)
{
- kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_end_context_switch(next);
}
@@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
/* Keep the local copy up to date. */
native_write_idt_entry(dt, entrynum, g);
/* Tell Host about this new entry. */
- kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
+ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
}
/*
@@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
struct desc_struct *idt = (void *)desc->address;
for (i = 0; i < (desc->size+1)/8; i++)
- kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+ hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
}
/*
@@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
struct desc_struct *gdt = (void *)desc->address;
for (i = 0; i < (desc->size+1)/8; i++)
- kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
+ hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
}
/*
@@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
{
native_write_gdt_entry(dt, entrynum, desc, type);
/* Tell Host about this new entry. */
- kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum,
- dt[entrynum].a, dt[entrynum].b);
+ hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
+ dt[entrynum].a, dt[entrynum].b, 0);
}
/*
@@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
}
/* Please wake us this far in the future. */
- kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta);
+ hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
return 0;
}
@@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
/* A 0 argument shuts the clock down. */
- kvm_hypercall0(LHCALL_SET_CLOCKEVENT);
+ hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
break;
case CLOCK_EVT_MODE_ONESHOT:
/* This is what we expect. */
@@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void)
/* STOP! Until an interrupt comes in. */
static void lguest_safe_halt(void)
{
- kvm_hypercall0(LHCALL_HALT);
+ hcall(LHCALL_HALT, 0, 0, 0, 0);
}
/*
@@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void)
*/
static void lguest_power_off(void)
{
- kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"),
- LGUEST_SHUTDOWN_POWEROFF);
+ hcall(LHCALL_SHUTDOWN, __pa("Power down"),
+ LGUEST_SHUTDOWN_POWEROFF, 0, 0);
}
/*
@@ -1123,7 +1122,7 @@ static void lguest_power_off(void)
*/
static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
{
- kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF);
+ hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
/* The hcall won't return, but to keep gcc happy, we're "done". */
return NOTIFY_DONE;
}
@@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
len = sizeof(scratch) - 1;
scratch[len] = '\0';
memcpy(scratch, buf, len);
- kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch));
+ hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
/* This routine returns the number of bytes actually written. */
return len;
@@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
*/
static void lguest_restart(char *reason)
{
- kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART);
+ hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
}
/*G:050
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 27eac0f..4f420c2f 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -32,7 +32,7 @@ ENTRY(lguest_entry)
*/
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ int $LGUEST_TRAP_ENTRY
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
OpenPOWER on IntegriCloud