diff options
-rw-r--r-- | arch/i386/kernel/cpu/mcheck/therm_throt.c | 6 | ||||
-rw-r--r-- | arch/i386/kernel/smpcommon.c | 8 | ||||
-rw-r--r-- | arch/x86_64/kernel/smp.c | 12 | ||||
-rw-r--r-- | drivers/kvm/Kconfig | 9 | ||||
-rw-r--r-- | drivers/kvm/kvm.h | 116 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 456 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 292 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 273 | ||||
-rw-r--r-- | drivers/kvm/svm.c | 59 | ||||
-rw-r--r-- | drivers/kvm/svm.h | 3 | ||||
-rw-r--r-- | drivers/kvm/vmx.c | 652 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.c | 44 | ||||
-rw-r--r-- | fs/anon_inodes.c | 1 | ||||
-rw-r--r-- | include/linux/magic.h | 1 | ||||
-rw-r--r-- | include/linux/notifier.h | 3 | ||||
-rw-r--r-- | include/linux/smp.h | 7 | ||||
-rw-r--r-- | kernel/cpu.c | 16 | ||||
-rw-r--r-- | kernel/cpuset.c | 3 |
18 files changed, 1198 insertions, 763 deletions
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 7ba7c3a..1203dc5 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, int err; sys_dev = get_cpu_sysdev(cpu); - mutex_lock(&therm_cpu_lock); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); break; } - mutex_unlock(&therm_cpu_lock); return NOTIFY_OK; } diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c index 1868ae1..bbfe85a 100644 --- a/arch/i386/kernel/smpcommon.c +++ b/arch/i386/kernel/smpcommon.c @@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, EXPORT_SYMBOL(smp_call_function); /** - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @cpu: The target CPU. Cannot be the calling CPU. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. @@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int ret; int me = get_cpu(); if (cpu == me) { - WARN_ON(1); + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); - return -EBUSY; + return 0; } ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 2ff4685..0694940 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info, } /* - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @nonatomic: Currently unused. @@ -374,14 +374,18 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, { /* prevent preemption and reschedule on another processor */ int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + if (cpu == me) { + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index e8e37d8..33fa28a 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -1,12 +1,17 @@ # # KVM configuration # -menu "Virtualization" +menuconfig VIRTUALIZATION + bool "Virtualization" depends on X86 + default y + +if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on X86 && EXPERIMENTAL + depends on X86_CMPXCHG64 || 64BIT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -35,4 +40,4 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. -endmenu +endif # VIRTUALIZATION diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 152312c..a7c5e6b 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -10,6 +10,8 @@ #include <linux/list.h> #include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/signal.h> +#include <linux/sched.h> #include <linux/mm.h> #include <asm/signal.h> @@ -18,6 +20,7 @@ #include <linux/kvm_para.h> #define CR0_PE_MASK (1ULL << 0) +#define CR0_MP_MASK (1ULL << 1) #define CR0_TS_MASK (1ULL << 3) #define CR0_NE_MASK (1ULL << 5) #define CR0_WP_MASK (1ULL << 16) @@ -42,7 +45,8 @@ (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ | CR0_NW_MASK | CR0_CD_MASK) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \ + | CR0_MP_MASK) #define KVM_GUEST_CR4_MASK \ (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) @@ -51,10 +55,10 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -#define KVM_MAX_VCPUS 1 +#define KVM_MAX_VCPUS 4 #define KVM_ALIAS_SLOTS 4 #define KVM_MEMORY_SLOTS 4 -#define KVM_NUM_MMU_PAGES 256 +#define KVM_NUM_MMU_PAGES 1024 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 @@ -80,6 +84,11 @@ #define KVM_PIO_PAGE_OFFSET 1 /* + * vcpu->requests bit members + */ +#define KVM_TLB_FLUSH 0 + +/* * Address types: * * gva - guest virtual address @@ -137,7 +146,7 @@ struct kvm_mmu_page { gfn_t gfn; union kvm_mmu_page_role role; - hpa_t page_hpa; + u64 *spt; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ @@ -232,6 +241,7 @@ struct kvm_pio_request { struct page *guest_pages[2]; unsigned guest_page_offset; int in; + int port; int size; int string; int down; @@ -252,8 +262,70 @@ struct kvm_stat { u32 halt_exits; u32 request_irq_exits; u32 irq_exits; + u32 light_exits; + u32 efer_reload; +}; + +struct kvm_io_device { + void (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr); + void (*destructor)(struct kvm_io_device *this); + + void *private; +}; + +static inline void kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, + int len, + void *val) +{ + dev->read(dev, addr, len, val); +} + +static inline void kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, + int len, + const void *val) +{ + dev->write(dev, addr, len, val); +} + +static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) +{ + return dev->in_range(dev, addr); +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->destructor) + dev->destructor(dev); +} + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. + */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 6 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; }; +void kvm_io_bus_init(struct kvm_io_bus *bus); +void kvm_io_bus_destroy(struct kvm_io_bus *bus); +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); + struct kvm_vcpu { struct kvm *kvm; union { @@ -266,6 +338,8 @@ struct kvm_vcpu { u64 host_tsc; struct kvm_run *run; int interrupt_window_open; + int guest_mode; + unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) unsigned long irq_pending[NR_IRQ_WORDS]; @@ -285,15 +359,20 @@ struct kvm_vcpu { u64 apic_base; u64 ia32_misc_enable_msr; int nmsrs; + int save_nmsrs; + int msr_offset_efer; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif struct vmx_msr_entry *guest_msrs; struct vmx_msr_entry *host_msrs; - struct list_head free_pages; - struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; struct kvm_mmu mmu; struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_page_header_cache; gfn_t last_pt_write_gfn; int last_pt_write_count; @@ -305,6 +384,11 @@ struct kvm_vcpu { char *guest_fx_image; int fpu_active; int guest_fpu_loaded; + struct vmx_host_state { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + } vmx_host_state; int mmio_needed; int mmio_read_completed; @@ -331,6 +415,7 @@ struct kvm_vcpu { u32 ar; } tr, es, ds, fs, gs; } rmode; + int halt_request; /* real mode on Intel only */ int cpuid_nent; struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; @@ -362,12 +447,15 @@ struct kvm { struct list_head active_mmu_pages; int n_free_mmu_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + int nvcpus; struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; int memory_config_version; int busy; unsigned long rmap_overflow; struct list_head vm_list; struct file *filp; + struct kvm_io_bus mmio_bus; + struct kvm_io_bus pio_bus; }; struct descriptor_table { @@ -488,6 +576,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int string, int down, gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, @@ -511,6 +600,7 @@ void save_msrs(struct vmx_msr_entry *e, int n); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, @@ -524,10 +614,12 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, unsigned long segment_base(u16 selector); -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); @@ -539,6 +631,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, return vcpu->mmu.page_fault(vcpu, gva, error_code); } +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 8f1f07a..1b206f1 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -16,34 +16,33 @@ */ #include "kvm.h" +#include "x86_emulate.h" +#include "segment_descriptor.h" #include <linux/kvm.h> #include <linux/module.h> #include <linux/errno.h> -#include <linux/magic.h> -#include <asm/processor.h> #include <linux/percpu.h> #include <linux/gfp.h> -#include <asm/msr.h> #include <linux/mm.h> #include <linux/miscdevice.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> #include <linux/reboot.h> -#include <asm/io.h> #include <linux/debugfs.h> #include <linux/highmem.h> #include <linux/file.h> -#include <asm/desc.h> #include <linux/sysdev.h> #include <linux/cpu.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/mount.h> #include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/smp.h> +#include <linux/anon_inodes.h> -#include "x86_emulate.h" -#include "segment_descriptor.h" +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/desc.h> MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -51,8 +50,12 @@ MODULE_LICENSE("GPL"); static DEFINE_SPINLOCK(kvm_lock); static LIST_HEAD(vm_list); +static cpumask_t cpus_hardware_enabled; + struct kvm_arch_ops *kvm_arch_ops; +static void hardware_disable(void *ignored); + #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) static struct kvm_stats_debugfs_item { @@ -72,13 +75,13 @@ static struct kvm_stats_debugfs_item { { "halt_exits", STAT_OFFSET(halt_exits) }, { "request_irq", STAT_OFFSET(request_irq_exits) }, { "irq_exits", STAT_OFFSET(irq_exits) }, + { "light_exits", STAT_OFFSET(light_exits) }, + { "efer_reload", STAT_OFFSET(efer_reload) }, { NULL } }; static struct dentry *debugfs_dir; -struct vfsmount *kvmfs_mnt; - #define MAX_IO_MSRS 256 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL @@ -100,55 +103,6 @@ struct segment_descriptor_64 { static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); -static struct inode *kvmfs_inode(struct file_operations *fops) -{ - int error = -ENOMEM; - struct inode *inode = new_inode(kvmfs_mnt->mnt_sb); - - if (!inode) - goto eexit_1; - - inode->i_fop = fops; - - /* - * Mark the inode dirty from the very beginning, - * that way it will never be moved to the dirty - * list because mark_inode_dirty() will think - * that it already _is_ on the dirty list. - */ - inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - return inode; - -eexit_1: - return ERR_PTR(error); -} - -static struct file *kvmfs_file(struct inode *inode, void *private_data) -{ - struct file *file = get_empty_filp(); - - if (!file) - return ERR_PTR(-ENFILE); - - file->f_path.mnt = mntget(kvmfs_mnt); - file->f_path.dentry = d_alloc_anon(inode); - if (!file->f_path.dentry) - return ERR_PTR(-ENOMEM); - file->f_mapping = inode->i_mapping; - - file->f_pos = 0; - file->f_flags = O_RDWR; - file->f_op = inode->i_fop; - file->f_mode = FMODE_READ | FMODE_WRITE; - file->f_version = 0; - file->private_data = private_data; - return file; -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -307,6 +261,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); } +static void ack_flush(void *_completed) +{ + atomic_t *completed = _completed; + + atomic_inc(completed); +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu, needed; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + atomic_t completed; + + atomic_set(&completed, 0); + cpus_clear(cpus); + needed = 0; + for (i = 0; i < kvm->nvcpus; ++i) { + vcpu = &kvm->vcpus[i]; + if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != raw_smp_processor_id()) + if (!cpu_isset(cpu, cpus)) { + cpu_set(cpu, cpus); + ++needed; + } + } + + /* + * We really want smp_call_function_mask() here. But that's not + * available, so ipi all cpus in parallel and wait for them + * to complete. + */ + for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus)) + smp_call_function_single(cpu, ack_flush, &completed, 1, 0); + while (atomic_read(&completed) != needed) { + cpu_relax(); + barrier(); + } +} + static struct kvm *kvm_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -315,8 +311,13 @@ static struct kvm *kvm_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm_io_bus_init(&kvm->pio_bus); spin_lock_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); + kvm_io_bus_init(&kvm->mmio_bus); for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; @@ -324,10 +325,6 @@ static struct kvm *kvm_create_vm(void) vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; - INIT_LIST_HEAD(&vcpu->free_pages); - spin_lock(&kvm_lock); - list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); } return kvm; } @@ -380,6 +377,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) } } +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->vmcs) + return; + + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { if (!vcpu->vmcs) @@ -400,6 +407,11 @@ static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + /* + * Unpin any mmu pages first. + */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + kvm_unload_vcpu_mmu(&kvm->vcpus[i]); for (i = 0; i < KVM_MAX_VCPUS; ++i) kvm_free_vcpu(&kvm->vcpus[i]); } @@ -414,6 +426,8 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->pio_bus); + kvm_io_bus_destroy(&kvm->mmio_bus); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); kfree(kvm); @@ -969,7 +983,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_memory_slot *memslot = NULL; + struct kvm_memory_slot *memslot; unsigned long rel_gfn; for (i = 0; i < kvm->nmemslots; ++i) { @@ -978,7 +992,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) { - if (!memslot || !memslot->dirty_bitmap) + if (!memslot->dirty_bitmap) return; rel_gfn = gfn - memslot->base_gfn; @@ -1037,12 +1051,31 @@ static int emulator_write_std(unsigned long addr, return X86EMUL_UNHANDLEABLE; } +static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + /* + * Note that its important to have this wrapper function because + * in the very near future we will be checking for MMIOs against + * the LAPIC as well as the general MMIO bus + */ + return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); +} + +static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); +} + static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_io_device *mmio_dev; + gpa_t gpa; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -1051,18 +1084,26 @@ static int emulator_read_emulated(unsigned long addr, } else if (emulator_read_std(addr, val, bytes, ctxt) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - else { - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; - return X86EMUL_UNHANDLEABLE; + /* + * Is this MMIO handled locally? + */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_read(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; } + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; } static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1070,18 +1111,20 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { struct page *page; void *virt; + unsigned offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); if (!page) return 0; - kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); - memcpy(virt + offset_in_page(gpa), val, bytes); + if (memcmp(virt + offset_in_page(gpa), val, bytes)) { + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); + memcpy(virt + offset_in_page(gpa), val, bytes); + } kunmap_atomic(virt, KM_USER0); - kvm_mmu_post_write(vcpu, gpa, bytes); return 1; } @@ -1090,8 +1133,9 @@ static int emulator_write_emulated(unsigned long addr, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_io_device *mmio_dev; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); if (gpa == UNMAPPED_GVA) { kvm_arch_ops->inject_page_fault(vcpu, addr, 2); @@ -1101,6 +1145,15 @@ static int emulator_write_emulated(unsigned long addr, if (emulator_write_phys(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; + /* + * Is this MMIO handled locally? + */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_write(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; + } + vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; @@ -1269,6 +1322,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + if (vcpu->irq_summary) + return 1; + + vcpu->run->exit_reason = KVM_EXIT_HLT; + ++vcpu->stat.halt_exits; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; @@ -1469,6 +1533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: /* MTRR registers */ case 0xfe: case 0x200 ... 0x2ff: @@ -1727,6 +1792,20 @@ static int complete_pio(struct kvm_vcpu *vcpu) return 0; } +void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) +{ + /* TODO: String I/O for in kernel device */ + + if (vcpu->pio.in) + kvm_iodevice_read(pio_dev, vcpu->pio.port, + vcpu->pio.size, + vcpu->pio_data); + else + kvm_iodevice_write(pio_dev, vcpu->pio.port, + vcpu->pio.size, + vcpu->pio_data); +} + int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int string, int down, gva_t address, int rep, unsigned port) @@ -1735,6 +1814,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int i; int nr_pages = 1; struct page *page; + struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -1746,17 +1826,27 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->pio.cur_count = count; vcpu->pio.size = size; vcpu->pio.in = in; + vcpu->pio.port = port; vcpu->pio.string = string; vcpu->pio.down = down; vcpu->pio.guest_page_offset = offset_in_page(address); vcpu->pio.rep = rep; + pio_dev = vcpu_find_pio_dev(vcpu, port); if (!string) { kvm_arch_ops->cache_regs(vcpu); memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); kvm_arch_ops->decache_regs(vcpu); + if (pio_dev) { + kernel_pio(pio_dev, vcpu); + complete_pio(vcpu); + return 1; + } return 0; } + /* TODO: String I/O for in kernel device */ + if (pio_dev) + printk(KERN_ERR "kvm_setup_pio: no string io support\n"); if (!count) { kvm_arch_ops->skip_emulated_instruction(vcpu); @@ -2273,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) struct inode *inode; struct file *file; + r = anon_inode_getfd(&fd, &inode, &file, + "kvm-vcpu", &kvm_vcpu_fops, vcpu); + if (r) + return r; atomic_inc(&vcpu->kvm->filp->f_count); - inode = kvmfs_inode(&kvm_vcpu_fops); - if (IS_ERR(inode)) { - r = PTR_ERR(inode); - goto out1; - } - - file = kvmfs_file(inode, vcpu); - if (IS_ERR(file)) { - r = PTR_ERR(file); - goto out2; - } - - r = get_unused_fd(); - if (r < 0) - goto out3; - fd = r; - fd_install(fd, file); - return fd; - -out3: - fput(file); -out2: - iput(inode); -out1: - fput(vcpu->kvm->filp); - return r; } /* @@ -2363,6 +2431,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) if (r < 0) goto out_free_vcpus; + spin_lock(&kvm_lock); + if (n >= kvm->nvcpus) + kvm->nvcpus = n + 1; + spin_unlock(&kvm_lock); + return r; out_free_vcpus: @@ -2376,6 +2449,27 @@ out: return r; } +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + u64 efer; + int i; + struct kvm_cpuid_entry *e, *entry; + + rdmsrl(MSR_EFER, efer); + entry = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & EFER_NX) && !(efer & EFER_NX)) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO ": guest NX capability removed\n"); + } +} + static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries) @@ -2390,6 +2484,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out; vcpu->cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); return 0; out: @@ -2738,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void) struct file *file; struct kvm *kvm; - inode = kvmfs_inode(&kvm_vm_fops); - if (IS_ERR(inode)) { - r = PTR_ERR(inode); - goto out1; - } - kvm = kvm_create_vm(); - if (IS_ERR(kvm)) { - r = PTR_ERR(kvm); - goto out2; + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); + if (r) { + kvm_destroy_vm(kvm); + return r; } - file = kvmfs_file(inode, kvm); - if (IS_ERR(file)) { - r = PTR_ERR(file); - goto out3; - } kvm->filp = file; - r = get_unused_fd(); - if (r < 0) - goto out4; - fd = r; - fd_install(fd, file); - return fd; - -out4: - fput(file); -out3: - kvm_destroy_vm(kvm); -out2: - iput(inode); -out1: - return r; } static long kvm_dev_ioctl(struct file *filp, @@ -2862,7 +2934,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, * in vmx root mode. */ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 1); } return NOTIFY_OK; } @@ -2905,33 +2977,88 @@ static void decache_vcpus_on_cpu(int cpu) spin_unlock(&kvm_lock); } +static void hardware_enable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_set(cpu, cpus_hardware_enabled); + kvm_arch_ops->hardware_enable(NULL); +} + +static void hardware_disable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_clear(cpu, cpus_hardware_enabled); + decache_vcpus_on_cpu(cpu); + kvm_arch_ops->hardware_disable(NULL); +} + static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, void *v) { int cpu = (long)v; switch (val) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: + case CPU_DYING: + case CPU_DYING_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - decache_vcpus_on_cpu(cpu); - smp_call_function_single(cpu, kvm_arch_ops->hardware_disable, - NULL, 0, 1); + smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, - NULL, 0, 1); + smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); break; } return NOTIFY_OK; } +void kvm_io_bus_init(struct kvm_io_bus *bus) +{ + memset(bus, 0, sizeof(*bus)); +} + +void kvm_io_bus_destroy(struct kvm_io_bus *bus) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + kvm_iodevice_destructor(pos); + } +} + +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + if (pos->in_range(pos, addr)) + return pos; + } + + return NULL; +} + +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +{ + BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + + bus->devs[bus->dev_count++] = dev; +} + static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, .priority = 20, /* must be > scheduler priority */ @@ -2983,14 +3110,13 @@ static void kvm_exit_debug(void) static int kvm_suspend(struct sys_device *dev, pm_message_t state) { - decache_vcpus_on_cpu(raw_smp_processor_id()); - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + hardware_disable(NULL); return 0; } static int kvm_resume(struct sys_device *dev) { - on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); + hardware_enable(NULL); return 0; } @@ -3007,18 +3133,6 @@ static struct sys_device kvm_sysdev = { hpa_t bad_page_address; -static int kvmfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt); -} - -static struct file_system_type kvm_fs_type = { - .name = "kvmfs", - .get_sb = kvmfs_get_sb, - .kill_sb = kill_anon_super, -}; - int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) { int r; @@ -3043,7 +3157,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) if (r < 0) goto out; - on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); + on_each_cpu(hardware_enable, NULL, 0, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_1; @@ -3075,7 +3189,7 @@ out_free_2: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); out_free_1: - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 1); kvm_arch_ops->hardware_unsetup(); out: kvm_arch_ops = NULL; @@ -3089,7 +3203,7 @@ void kvm_exit_arch(void) sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 1); kvm_arch_ops->hardware_unsetup(); kvm_arch_ops = NULL; } @@ -3103,14 +3217,6 @@ static __init int kvm_init(void) if (r) goto out4; - r = register_filesystem(&kvm_fs_type); - if (r) - goto out3; - - kvmfs_mnt = kern_mount(&kvm_fs_type); - r = PTR_ERR(kvmfs_mnt); - if (IS_ERR(kvmfs_mnt)) - goto out2; kvm_init_debug(); kvm_init_msr_list(); @@ -3127,10 +3233,6 @@ static __init int kvm_init(void) out: kvm_exit_debug(); - mntput(kvmfs_mnt); -out2: - unregister_filesystem(&kvm_fs_type); -out3: kvm_mmu_module_exit(); out4: return r; @@ -3140,8 +3242,6 @@ static __exit void kvm_exit(void) { kvm_exit_debug(); __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); - mntput(kvmfs_mnt); - unregister_filesystem(&kvm_fs_type); kvm_mmu_module_exit(); } diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index e8e2281..b297a6b 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -16,15 +16,18 @@ * the COPYING file in the top-level directory. * */ + +#include "vmx.h" +#include "kvm.h" + #include <linux/types.h> #include <linux/string.h> -#include <asm/page.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/module.h> -#include "vmx.h" -#include "kvm.h" +#include <asm/page.h> +#include <asm/cmpxchg.h> #undef MMU_DEBUG @@ -90,25 +93,11 @@ static int dbg = 1; #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) -#define PT32_PTE_COPY_MASK \ - (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK) - -#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK) - #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1) -#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT) - -#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1) -#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT)) - -#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT) - #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 @@ -165,6 +154,8 @@ struct kvm_rmap_desc { static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_cache; +static struct kmem_cache *mmu_page_header_cache; static int is_write_protection(struct kvm_vcpu *vcpu) { @@ -202,6 +193,15 @@ static int is_rmap_pte(u64 pte) == (PT_WRITABLE_MASK | PT_PRESENT_MASK); } +static void set_shadow_pte(u64 *sptep, u64 spte) +{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min, gfp_t gfp_flags) @@ -235,6 +235,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, rmap_desc_cache, 1, gfp_flags); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_cache, + mmu_page_cache, 4, gfp_flags); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, + mmu_page_header_cache, 4, gfp_flags); out: return r; } @@ -258,6 +266,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->mmu_page_cache); + mmu_free_memory_cache(&vcpu->mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -433,19 +443,18 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); - kvm_arch_ops->tlb_flush(vcpu); - *spte &= ~(u64)PT_WRITABLE_MASK; + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + kvm_flush_remote_tlbs(vcpu->kvm); } } #ifdef MMU_DEBUG -static int is_empty_shadow_page(hpa_t page_hpa) +static int is_empty_shadow_page(u64 *spt) { u64 *pos; u64 *end; - for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); - pos != end; pos++) + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) if (*pos != 0) { printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, pos, *pos); @@ -455,13 +464,13 @@ static int is_empty_shadow_page(hpa_t page_hpa) } #endif -static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) +static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page_head) { - struct kvm_mmu_page *page_head = page_header(page_hpa); - - ASSERT(is_empty_shadow_page(page_hpa)); - page_head->page_hpa = page_hpa; - list_move(&page_head->link, &vcpu->free_pages); + ASSERT(is_empty_shadow_page(page_head->spt)); + list_del(&page_head->link); + mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt); + mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head); ++vcpu->kvm->n_free_mmu_pages; } @@ -475,12 +484,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, { struct kvm_mmu_page *page; - if (list_empty(&vcpu->free_pages)) + if (!vcpu->kvm->n_free_mmu_pages) return NULL; - page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); - list_move(&page->link, &vcpu->kvm->active_mmu_pages); - ASSERT(is_empty_shadow_page(page->page_hpa)); + page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, + sizeof *page); + page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(page->spt), (unsigned long)page); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + ASSERT(is_empty_shadow_page(page->spt)); page->slot_bitmap = 0; page->multimapped = 0; page->parent_pte = parent_pte; @@ -638,7 +650,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, u64 *pt; u64 ent; - pt = __va(page->page_hpa); + pt = page->spt; if (page->role.level == PT_PAGE_TABLE_LEVEL) { for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -646,7 +658,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, rmap_remove(vcpu, &pt[i]); pt[i] = 0; } - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); return; } @@ -659,6 +671,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); } + kvm_flush_remote_tlbs(vcpu->kvm); } static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, @@ -685,12 +698,12 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, } BUG_ON(!parent_pte); kvm_mmu_put_page(vcpu, page, parent_pte); - *parent_pte = 0; + set_shadow_pte(parent_pte, 0); } kvm_mmu_page_unlink_children(vcpu, page); if (!page->root_count) { hlist_del(&page->hash_link); - kvm_mmu_free_page(vcpu, page->page_hpa); + kvm_mmu_free_page(vcpu, page); } else list_move(&page->link, &vcpu->kvm->active_mmu_pages); } @@ -717,6 +730,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) return r; } +static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *page; + + while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", + __FUNCTION__, gfn, page->role.word); + kvm_mmu_zap_page(vcpu, page); + } +} + static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) { int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); @@ -805,7 +829,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) return -ENOMEM; } - table[index] = new_table->page_hpa | PT_PRESENT_MASK + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; } table_addr = table[index] & PT64_BASE_ADDR_MASK; @@ -817,11 +841,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *page; + if (!VALID_PAGE(vcpu->mmu.root_hpa)) + return; #ifdef CONFIG_X86_64 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->mmu.root_hpa; - ASSERT(VALID_PAGE(root)); page = page_header(root); --page->root_count; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -832,7 +857,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->mmu.pae_root[i]; if (root) { - ASSERT(VALID_PAGE(root)); root &= PT64_BASE_ADDR_MASK; page = page_header(root); --page->root_count; @@ -857,7 +881,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); page = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.root_hpa = root; return; @@ -878,7 +902,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; } @@ -928,9 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa); + context->root_hpa = INVALID_PAGE; return 0; } @@ -944,59 +966,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); mmu_free_roots(vcpu); - if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); -} - -static inline void set_pte_common(struct kvm_vcpu *vcpu, - u64 *shadow_pte, - gpa_t gaddr, - int dirty, - u64 access_bits, - gfn_t gfn) -{ - hpa_t paddr; - - *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; - if (!dirty) - access_bits &= ~PT_WRITABLE_MASK; - - paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - - *shadow_pte |= access_bits; - - if (is_error_hpa(paddr)) { - *shadow_pte |= gaddr; - *shadow_pte |= PT_SHADOW_IO_MARK; - *shadow_pte &= ~PT_PRESENT_MASK; - return; - } - - *shadow_pte |= paddr; - - if (access_bits & PT_WRITABLE_MASK) { - struct kvm_mmu_page *shadow; - - shadow = kvm_mmu_lookup_page(vcpu, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - access_bits &= ~PT_WRITABLE_MASK; - if (is_writeble_pte(*shadow_pte)) { - *shadow_pte &= ~PT_WRITABLE_MASK; - kvm_arch_ops->tlb_flush(vcpu); - } - } - } - - if (access_bits & PT_WRITABLE_MASK) - mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); - - page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - rmap_add(vcpu, shadow_pte); } static void inject_page_fault(struct kvm_vcpu *vcpu, @@ -1006,23 +975,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); } -static inline int fix_read_pf(u64 *shadow_ent) -{ - if ((*shadow_ent & PT_SHADOW_USER_MASK) && - !(*shadow_ent & PT_USER_MASK)) { - /* - * If supervisor write protect is disabled, we shadow kernel - * pages as user pages so we can trap the write access. - */ - *shadow_ent |= PT_USER_MASK; - *shadow_ent &= ~PT_WRITABLE_MASK; - - return 1; - - } - return 0; -} - static void paging_free(struct kvm_vcpu *vcpu) { nonpaging_free(vcpu); @@ -1047,10 +999,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->free = paging_free; context->root_level = level; context->shadow_root_level = level; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1069,10 +1018,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->free = paging_free; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1107,18 +1053,33 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ int r; - destroy_kvm_mmu(vcpu); - r = init_kvm_mmu(vcpu); - if (r < 0) - goto out; + spin_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); out: + spin_unlock(&vcpu->kvm->lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} -static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, u64 *spte) { @@ -1135,9 +1096,25 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, } } *spte = 0; + kvm_flush_remote_tlbs(vcpu->kvm); +} + +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte, + const void *new, int bytes) +{ + if (page->role.level != PT_PAGE_TABLE_LEVEL) + return; + + if (page->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, page, spte, new, bytes); + else + paging64_update_pte(vcpu, page, spte, new, bytes); } -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; @@ -1149,6 +1126,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) unsigned pte_size; unsigned page_offset; unsigned misaligned; + unsigned quadrant; int level; int flooded = 0; int npte; @@ -1169,6 +1147,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) continue; pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; if (misaligned || flooded) { /* * Misaligned accesses are too much trouble to fix @@ -1200,21 +1179,20 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) page_offset <<= 1; npte = 2; } + quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; + if (quadrant != page->role.quadrant) + continue; } - spte = __va(page->page_hpa); - spte += page_offset / sizeof(*spte); + spte = &page->spt[page_offset / sizeof(*spte)]; while (npte--) { - mmu_pre_write_zap_pte(vcpu, page, spte); + mmu_pte_write_zap_pte(vcpu, page, spte); + mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); ++spte; } } } -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) -{ -} - int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); @@ -1243,13 +1221,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) struct kvm_mmu_page, link); kvm_mmu_zap_page(vcpu, page); } - while (!list_empty(&vcpu->free_pages)) { - page = list_entry(vcpu->free_pages.next, - struct kvm_mmu_page, link); - list_del(&page->link); - __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); - page->page_hpa = INVALID_PAGE; - } free_page((unsigned long)vcpu->mmu.pae_root); } @@ -1260,18 +1231,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { - struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; - - INIT_LIST_HEAD(&page_header->link); - if ((page = alloc_page(GFP_KERNEL)) == NULL) - goto error_1; - set_page_private(page, (unsigned long)page_header); - page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; - memset(__va(page_header->page_hpa), 0, PAGE_SIZE); - list_add(&page_header->link, &vcpu->free_pages); - ++vcpu->kvm->n_free_mmu_pages; - } + vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. @@ -1296,7 +1256,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - ASSERT(list_empty(&vcpu->free_pages)); return alloc_mmu_pages(vcpu); } @@ -1305,7 +1264,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - ASSERT(!list_empty(&vcpu->free_pages)); return init_kvm_mmu(vcpu); } @@ -1331,7 +1289,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) if (!test_bit(slot, &page->slot_bitmap)) continue; - pt = __va(page->page_hpa); + pt = page->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ if (pt[i] & PT_WRITABLE_MASK) { @@ -1354,7 +1312,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) } mmu_free_memory_caches(vcpu); - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); init_kvm_mmu(vcpu); } @@ -1364,6 +1322,10 @@ void kvm_mmu_module_exit(void) kmem_cache_destroy(pte_chain_cache); if (rmap_desc_cache) kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_cache) + kmem_cache_destroy(mmu_page_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) @@ -1379,6 +1341,18 @@ int kvm_mmu_module_init(void) if (!rmap_desc_cache) goto nomem; + mmu_page_cache = kmem_cache_create("kvm_mmu_page", + PAGE_SIZE, + PAGE_SIZE, 0, NULL, NULL); + if (!mmu_page_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL, NULL); + if (!mmu_page_header_cache) + goto nomem; + return 0; nomem: @@ -1482,7 +1456,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) int i; list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { - u64 *pt = __va(page->page_hpa); + u64 *pt = page->spt; if (page->role.level != PT_PAGE_TABLE_LEVEL) continue; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 73ffbff..a7c5cb0 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #else @@ -46,7 +45,6 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK #define PT_MAX_FULL_LEVELS 2 #else #error Invalid PTTYPE value @@ -192,40 +190,143 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); } -static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, - u64 *shadow_pte, u64 access_bits, gfn_t gfn) +static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + pt_element_t *gpte, + u64 access_bits, + int user_fault, + int write_fault, + int *ptwrite, + struct guest_walker *walker, + gfn_t gfn) { - ASSERT(*shadow_pte == 0); - access_bits &= guest_pte; - *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); - set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, - guest_pte & PT_DIRTY_MASK, access_bits, gfn); + hpa_t paddr; + int dirty = *gpte & PT_DIRTY_MASK; + u64 spte = *shadow_pte; + int was_rmapped = is_rmap_pte(spte); + + pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, spte, (u64)*gpte, access_bits, + write_fault, user_fault, gfn); + + if (write_fault && !dirty) { + *gpte |= PT_DIRTY_MASK; + dirty = 1; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); + } + + spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; + spte |= *gpte & PT64_NX_MASK; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + spte |= PT_PRESENT_MASK; + if (access_bits & PT_USER_MASK) + spte |= PT_USER_MASK; + + if (is_error_hpa(paddr)) { + spte |= gaddr; + spte |= PT_SHADOW_IO_MARK; + spte &= ~PT_PRESENT_MASK; + set_shadow_pte(shadow_pte, spte); + return; + } + + spte |= paddr; + + if ((access_bits & PT_WRITABLE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + struct kvm_mmu_page *shadow; + + spte |= PT_WRITABLE_MASK; + if (user_fault) { + mmu_unshadow(vcpu, gfn); + goto unshadowed; + } + + shadow = kvm_mmu_lookup_page(vcpu, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + access_bits &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); + } + if (write_fault) + *ptwrite = 1; + } + } + +unshadowed: + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + set_shadow_pte(shadow_pte, spte); + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + if (!was_rmapped) + rmap_add(vcpu, shadow_pte); } -static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, - u64 *shadow_pte, u64 access_bits, gfn_t gfn) +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) +{ + access_bits &= *gpte; + FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, + gpte, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes) +{ + pt_element_t gpte; + + if (bytes < sizeof(pt_element_t)) + return; + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + 0, NULL, NULL, + (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); +} + +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { gpa_t gaddr; - ASSERT(*shadow_pte == 0); - access_bits &= guest_pde; + access_bits &= *gpde; gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) - gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << + gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); - *shadow_pte = guest_pde & PT_PTE_COPY_MASK; - set_pte_common(vcpu, shadow_pte, gaddr, - guest_pde & PT_DIRTY_MASK, access_bits, gfn); + FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, + gpde, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker) + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite) { hpa_t shadow_addr; int level; + u64 *shadow_ent; u64 *prev_shadow_ent = NULL; pt_element_t *guest_ent = walker->ptep; @@ -242,37 +343,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, for (; ; level--) { u32 index = SHADOW_PT_INDEX(addr, level); - u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; struct kvm_mmu_page *shadow_page; u64 shadow_pte; int metaphysical; gfn_t table_gfn; unsigned hugepage_access = 0; + shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) - return shadow_ent; + break; shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; prev_shadow_ent = shadow_ent; continue; } - if (level == PT_PAGE_TABLE_LEVEL) { - - if (walker->level == PT_DIRECTORY_LEVEL) { - if (prev_shadow_ent) - *prev_shadow_ent |= PT_SHADOW_PS_MARK; - FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, - walker->inherited_ar, - walker->gfn); - } else { - ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, - walker->inherited_ar, - walker->gfn); - } - return shadow_ent; - } + if (level == PT_PAGE_TABLE_LEVEL) + break; if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { @@ -289,90 +376,24 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, metaphysical, hugepage_access, shadow_ent); - shadow_addr = shadow_page->page_hpa; + shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *shadow_ent = shadow_pte; prev_shadow_ent = shadow_ent; } -} -/* - * The guest faulted for write. We need to - * - * - check write permissions - * - update the guest pte dirty bit - * - update our own dirty page tracking structures - */ -static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, - u64 *shadow_ent, - struct guest_walker *walker, - gva_t addr, - int user, - int *write_pt) -{ - pt_element_t *guest_ent; - int writable_shadow; - gfn_t gfn; - struct kvm_mmu_page *page; - - if (is_writeble_pte(*shadow_ent)) - return !user || (*shadow_ent & PT_USER_MASK); - - writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; - if (user) { - /* - * User mode access. Fail if it's a kernel page or a read-only - * page. - */ - if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow) - return 0; - ASSERT(*shadow_ent & PT_USER_MASK); - } else - /* - * Kernel mode access. Fail if it's a read-only page and - * supervisor write protection is enabled. - */ - if (!writable_shadow) { - if (is_write_protection(vcpu)) - return 0; - *shadow_ent &= ~PT_USER_MASK; - } - - guest_ent = walker->ptep; - - if (!is_present_pte(*guest_ent)) { - *shadow_ent = 0; - return 0; + if (walker->level == PT_DIRECTORY_LEVEL) { + FNAME(set_pde)(vcpu, guest_ent, shadow_ent, + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); + } else { + ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); + FNAME(set_pte)(vcpu, guest_ent, shadow_ent, + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); } - - gfn = walker->gfn; - - if (user) { - /* - * Usermode page faults won't be for page table updates. - */ - while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", - __FUNCTION__, gfn, page->role.word); - kvm_mmu_zap_page(vcpu, page); - } - } else if (kvm_mmu_lookup_page(vcpu, gfn)) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - mark_page_dirty(vcpu->kvm, gfn); - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - *guest_ent |= PT_DIRTY_MASK; - *write_pt = 1; - return 0; - } - mark_page_dirty(vcpu->kvm, gfn); - *shadow_ent |= PT_WRITABLE_MASK; - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - *guest_ent |= PT_DIRTY_MASK; - rmap_add(vcpu, shadow_ent); - - return 1; + return shadow_ent; } /* @@ -397,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *shadow_pte; - int fixed; int write_pt = 0; int r; @@ -421,27 +441,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); FNAME(release_walker)(&walker); + vcpu->last_pt_write_count = 0; /* reset fork detector */ return 0; } - shadow_pte = FNAME(fetch)(vcpu, addr, &walker); - pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, - shadow_pte, *shadow_pte); - - /* - * Update the shadow pte. - */ - if (write_fault) - fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, - user_fault, &write_pt); - else - fixed = fix_read_pf(shadow_pte); - - pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, - shadow_pte, *shadow_pte); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); FNAME(release_walker)(&walker); + if (!write_pt) + vcpu->last_pt_write_count = 0; /* reset fork detector */ + /* * mmio: emulate if accessible, otherwise its a guest fault. */ @@ -478,7 +491,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) #undef PT_INDEX #undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK -#undef PT_PTE_COPY_MASK -#undef PT_NON_PTE_COPY_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_MAX_FULL_LEVELS diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index fa17d6d..bc818cc 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -14,16 +14,17 @@ * */ +#include "kvm_svm.h" +#include "x86_emulate.h" + #include <linux/module.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/profile.h> #include <linux/sched.h> -#include <asm/desc.h> -#include "kvm_svm.h" -#include "x86_emulate.h" +#include <asm/desc.h> MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -378,7 +379,7 @@ static __init int svm_hardware_setup(void) int cpu; struct page *iopm_pages; struct page *msrpm_pages; - void *msrpm_va; + void *iopm_va, *msrpm_va; int r; kvm_emulator_want_group7_invlpg(); @@ -387,8 +388,10 @@ static __init int svm_hardware_setup(void) if (!iopm_pages) return -ENOMEM; - memset(page_address(iopm_pages), 0xff, - PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; @@ -579,7 +582,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) goto out2; vcpu->svm->vmcb = page_address(page); - memset(vcpu->svm->vmcb, 0, PAGE_SIZE); + clear_page(vcpu->svm->vmcb); vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; vcpu->svm->asid_generation = 0; memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); @@ -587,9 +590,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->fpu_active = 1; - vcpu->apic_base = 0xfee00000 | - /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | - MSR_IA32_APICBASE_ENABLE; + vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vcpu == &vcpu->kvm->vcpus[0]) + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; return 0; @@ -955,7 +958,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. */ - memset(vcpu->svm->vmcb, 0, PAGE_SIZE); + clear_page(vcpu->svm->vmcb); init_vmcb(vcpu->svm->vmcb); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; @@ -1113,12 +1116,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; skip_emulated_instruction(vcpu); - if (vcpu->irq_summary) - return 1; - - kvm_run->exit_reason = KVM_EXIT_HLT; - ++vcpu->stat.halt_exits; - return 0; + return kvm_emulate_halt(vcpu); } static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1473,6 +1471,11 @@ static void load_db_regs(unsigned long *db_regs) asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); } +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u16 fs_selector; @@ -1481,11 +1484,20 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return r; + if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); clgi(); + vcpu->guest_mode = 1; + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + svm_flush_tlb(vcpu); + pre_svm_run(vcpu); save_host_msrs(vcpu); @@ -1617,6 +1629,8 @@ again: #endif : "cc", "memory" ); + vcpu->guest_mode = 0; + if (vcpu->fpu_active) { fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); @@ -1681,11 +1695,6 @@ again: return r; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu) -{ - force_new_asid(vcpu); -} - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { vcpu->svm->vmcb->save.cr3 = root; @@ -1727,6 +1736,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu, static int is_disabled(void) { + u64 vm_cr; + + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) + return 1; + return 0; } diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h index 5e93814..3b1b0f3 100644 --- a/drivers/kvm/svm.h +++ b/drivers/kvm/svm.h @@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CPUID_FUNC 0x8000000a #define MSR_EFER_SVME_MASK (1ULL << 12) +#define MSR_VM_CR 0xc0010114 #define MSR_VM_HSAVE_PA 0xc0010117ULL +#define SVM_VM_CR_SVM_DISABLE 4 + #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index c1ac106..80628f6 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -17,28 +17,35 @@ #include "kvm.h" #include "vmx.h" +#include "segment_descriptor.h" + #include <linux/module.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/profile.h> #include <linux/sched.h> + #include <asm/io.h> #include <asm/desc.h> -#include "segment_descriptor.h" - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static int init_rmode_tss(struct kvm *kvm); + static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static struct page *vmx_io_bitmap_a; +static struct page *vmx_io_bitmap_b; + #ifdef CONFIG_X86_64 #define HOST_IS_64 1 #else #define HOST_IS_64 0 #endif +#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) static struct vmcs_descriptor { int size; @@ -82,18 +89,17 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -#ifdef CONFIG_X86_64 -static unsigned msr_offset_kernel_gs_base; -#define NR_64BIT_MSRS 4 -/* - * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt - * mechanism (cpu bug AA24) - */ -#define NR_BAD_MSRS 2 -#else -#define NR_64BIT_MSRS 0 -#define NR_BAD_MSRS 0 -#endif +static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr) +{ + return (u64)msr.data & EFER_SAVE_RESTORE_BITS; +} + +static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu) +{ + int efer_offset = vcpu->msr_offset_efer; + return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) != + msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]); +} static inline int is_page_fault(u32 intr_info) { @@ -115,13 +121,23 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr) { int i; for (i = 0; i < vcpu->nmsrs; ++i) if (vcpu->guest_msrs[i].index == msr) - return &vcpu->guest_msrs[i]; + return i; + return -1; +} + +static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +{ + int i; + + i = __find_msr_index(vcpu, msr); + if (i >= 0) + return &vcpu->guest_msrs[i]; return NULL; } @@ -147,6 +163,7 @@ static void __vcpu_clear(void *arg) vmcs_clear(vcpu->vmcs); if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) per_cpu(current_vmcs, cpu) = NULL; + rdtscll(vcpu->host_tsc); } static void vcpu_clear(struct kvm_vcpu *vcpu) @@ -234,6 +251,127 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = 1u << PF_VECTOR; + if (!vcpu->fpu_active) + eb |= 1u << NM_VECTOR; + if (vcpu->guest_debug.enabled) + eb |= 1u << 1; + if (vcpu->rmode.active) + eb = ~0; + vmcs_write32(EXCEPTION_BITMAP, eb); +} + +static void reload_tss(void) +{ +#ifndef CONFIG_X86_64 + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +static void load_transition_efer(struct kvm_vcpu *vcpu) +{ + u64 trans_efer; + int efer_offset = vcpu->msr_offset_efer; + + trans_efer = vcpu->host_msrs[efer_offset].data; + trans_efer &= ~EFER_SAVE_RESTORE_BITS; + trans_efer |= msr_efer_save_restore_bits( + vcpu->guest_msrs[efer_offset]); + wrmsrl(MSR_EFER, trans_efer); + vcpu->stat.efer_reload++; +} + +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vmx_host_state *hs = &vcpu->vmx_host_state; + + if (hs->loaded) + return; + + hs->loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + hs->ldt_sel = read_ldt(); + hs->fs_gs_ldt_reload_needed = hs->ldt_sel; + hs->fs_sel = read_fs(); + if (!(hs->fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel); + else { + vmcs_write16(HOST_FS_SELECTOR, 0); + hs->fs_gs_ldt_reload_needed = 1; + } + hs->gs_sel = read_gs(); + if (!(hs->gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + hs->fs_gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); +#endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1); + } +#endif + load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + if (msr_efer_need_save_restore(vcpu)) + load_transition_efer(vcpu); +} + +static void vmx_load_host_state(struct kvm_vcpu *vcpu) +{ + struct vmx_host_state *hs = &vcpu->vmx_host_state; + + if (!hs->loaded) + return; + + hs->loaded = 0; + if (hs->fs_gs_ldt_reload_needed) { + load_ldt(hs->ldt_sel); + load_fs(hs->fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(hs->gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } + save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); + if (msr_efer_need_save_restore(vcpu)) + load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -242,6 +380,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) { u64 phys_addr = __pa(vcpu->vmcs); int cpu; + u64 tsc_this, delta; cpu = get_cpu(); @@ -275,15 +414,43 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* + * Make sure the time stamp counter is monotonous. + */ + rdtscll(tsc_this); + delta = vcpu->host_tsc - tsc_this; + vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); } } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_load_host_state(vcpu); kvm_put_guest_fpu(vcpu); put_cpu(); } +static void vmx_fpu_activate(struct kvm_vcpu *vcpu) +{ + if (vcpu->fpu_active) + return; + vcpu->fpu_active = 1; + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + if (vcpu->cr0 & CR0_TS_MASK) + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + update_exception_bitmap(vcpu); +} + +static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active) + return; + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + update_exception_bitmap(vcpu); +} + static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) { vcpu_clear(vcpu); @@ -332,41 +499,61 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) } /* + * Swap MSR entry in host/guest MSR entry array. + */ +void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) +{ + struct vmx_msr_entry tmp; + tmp = vcpu->guest_msrs[to]; + vcpu->guest_msrs[to] = vcpu->guest_msrs[from]; + vcpu->guest_msrs[from] = tmp; + tmp = vcpu->host_msrs[to]; + vcpu->host_msrs[to] = vcpu->host_msrs[from]; + vcpu->host_msrs[from] = tmp; +} + +/* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy * mode, as fiddling with msrs is very expensive. */ static void setup_msrs(struct kvm_vcpu *vcpu) { - int nr_skip, nr_good_msrs; - - if (is_long_mode(vcpu)) - nr_skip = NR_BAD_MSRS; - else - nr_skip = NR_64BIT_MSRS; - nr_good_msrs = vcpu->nmsrs - nr_skip; + int save_nmsrs; - /* - * MSR_K6_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - if (find_msr_entry(vcpu, MSR_K6_STAR)) { - --nr_good_msrs; + save_nmsrs = 0; #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) - ++nr_good_msrs; -#endif + if (is_long_mode(vcpu)) { + int index; + + index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_LSTAR); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_CSTAR); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vcpu, MSR_K6_STAR); + if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE)) + move_msr_up(vcpu, index, save_nmsrs++); } +#endif + vcpu->save_nmsrs = save_nmsrs; - vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, - virt_to_phys(vcpu->guest_msrs + nr_skip)); - vmcs_writel(VM_EXIT_MSR_STORE_ADDR, - virt_to_phys(vcpu->guest_msrs + nr_skip)); - vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, - virt_to_phys(vcpu->host_msrs + nr_skip)); - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ +#ifdef CONFIG_X86_64 + vcpu->msr_offset_kernel_gs_base = + __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); +#endif + vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER); } /* @@ -394,23 +581,6 @@ static void guest_write_tsc(u64 guest_tsc) vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } -static void reload_tss(void) -{ -#ifndef CONFIG_X86_64 - - /* - * VT restores TR but not its size. Useless. - */ - struct descriptor_table gdt; - struct segment_descriptor *descs; - - get_gdt(&gdt); - descs = (void *)gdt.base; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -#endif -} - /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -470,10 +640,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vmx_msr_entry *msr; + int ret = 0; + switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: - return kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_index, data); + if (vcpu->vmx_host_state.loaded) + load_transition_efer(vcpu); + break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); break; @@ -497,14 +672,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vcpu, msr_index); if (msr) { msr->data = data; + if (vcpu->vmx_host_state.loaded) + load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); break; } - return kvm_set_msr_common(vcpu, msr_index, data); - msr->data = data; - break; + ret = kvm_set_msr_common(vcpu, msr_index, data); } - return 0; + return ret; } /* @@ -530,10 +705,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) { unsigned long dr7 = 0x400; - u32 exception_bitmap; int old_singlestep; - exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); old_singlestep = vcpu->guest_debug.singlestep; vcpu->guest_debug.enabled = dbg->enabled; @@ -549,13 +722,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) dr7 |= 0 << (i*4+16); /* execution breakpoint */ } - exception_bitmap |= (1u << 1); /* Trap debug exceptions */ - vcpu->guest_debug.singlestep = dbg->singlestep; - } else { - exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ + } else vcpu->guest_debug.singlestep = 0; - } if (old_singlestep && !vcpu->guest_debug.singlestep) { unsigned long flags; @@ -565,7 +734,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) vmcs_writel(GUEST_RFLAGS, flags); } - vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); + update_exception_bitmap(vcpu); vmcs_writel(GUEST_DR7, dr7); return 0; @@ -679,14 +848,6 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - if (vcpu->rmode.active) - vmcs_write32(EXCEPTION_BITMAP, ~0); - else - vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); -} - static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -793,6 +954,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); + + init_rmode_tss(vcpu->kvm); } #ifdef CONFIG_X86_64 @@ -837,6 +1000,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + vmx_fpu_deactivate(vcpu); + if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) enter_pmode(vcpu); @@ -852,26 +1017,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (!(cr0 & CR0_TS_MASK)) { - vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); - } - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); vcpu->cr0 = cr0; + + if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK)) + vmx_fpu_activate(vcpu); } static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - - if (!(vcpu->cr0 & CR0_TS_MASK)) { - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); - vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); - } + if (vcpu->cr0 & CR0_PE_MASK) + vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -937,23 +1096,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +static u32 vmx_segment_access_rights(struct kvm_segment *var) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - if (vcpu->rmode.active && var->s) { - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else if (var->unusable) + if (var->unusable) ar = 1 << 16; else { ar = var->type & 15; @@ -967,6 +1114,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, } if (ar == 0) /* a 0 value means unusable */ ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + if (vcpu->rmode.active && seg == VCPU_SREG_TR) { + vcpu->rmode.tr.selector = var->selector; + vcpu->rmode.tr.base = var->base; + vcpu->rmode.tr.limit = var->limit; + vcpu->rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vcpu->rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); vmcs_write32(sf->ar_bytes, ar); } @@ -1018,16 +1194,16 @@ static int init_rmode_tss(struct kvm* kvm) } page = kmap_atomic(p1, KM_USER0); - memset(page, 0, PAGE_SIZE); + clear_page(page); *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; kunmap_atomic(page, KM_USER0); page = kmap_atomic(p2, KM_USER0); - memset(page, 0, PAGE_SIZE); + clear_page(page); kunmap_atomic(page, KM_USER0); page = kmap_atomic(p3, KM_USER0); - memset(page, 0, PAGE_SIZE); + clear_page(page); *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; kunmap_atomic(page, KM_USER0); @@ -1066,7 +1242,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) struct descriptor_table dt; int i; int ret = 0; - extern asmlinkage void kvm_vmx_return(void); + unsigned long kvm_vmx_return; if (!init_rmode_tss(vcpu->kvm)) { ret = -ENOMEM; @@ -1076,9 +1252,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) memset(vcpu->regs, 0, sizeof(vcpu->regs)); vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); vcpu->cr8 = 0; - vcpu->apic_base = 0xfee00000 | - /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | - MSR_IA32_APICBASE_ENABLE; + vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vcpu == &vcpu->kvm->vcpus[0]) + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; fx_init(vcpu); @@ -1129,8 +1305,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); /* I/O */ - vmcs_write64(IO_BITMAP_A, 0); - vmcs_write64(IO_BITMAP_B, 0); + vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); guest_write_tsc(0); @@ -1150,12 +1326,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) CPU_BASED_HLT_EXITING /* 20.6.2 */ | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ - | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ + | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */ | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ ); - vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -1185,8 +1360,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - - vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ + asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -1210,10 +1388,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->host_msrs[j].reserved = 0; vcpu->host_msrs[j].data = data; vcpu->guest_msrs[j] = vcpu->host_msrs[j]; -#ifdef CONFIG_X86_64 - if (index == MSR_KERNEL_GS_BASE) - msr_offset_kernel_gs_base = j; -#endif ++vcpu->nmsrs; } @@ -1241,6 +1415,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 vmx_set_efer(vcpu, 0); #endif + vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); return 0; @@ -1365,7 +1541,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (!vcpu->rmode.active) return 0; - if (vec == GP_VECTOR && err_code == 0) + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) return 1; return 0; @@ -1400,10 +1580,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (is_no_device(intr_info)) { - vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); - if (!(vcpu->cr0 & CR0_TS_MASK)) - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vmx_fpu_activate(vcpu); return 1; } @@ -1445,8 +1622,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->rmode.active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) + error_code)) { + if (vcpu->halt_request) { + vcpu->halt_request = 0; + return kvm_emulate_halt(vcpu); + } return 1; + } if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -1595,11 +1777,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) break; case 2: /* clts */ vcpu_load_rsp_rip(vcpu); - vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vmx_fpu_deactivate(vcpu); vcpu->cr0 &= ~CR0_TS_MASK; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); + vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -1734,12 +1915,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - if (vcpu->irq_summary) - return 1; - - kvm_run->exit_reason = KVM_EXIT_HLT; - ++vcpu->stat.halt_exits; - return 0; + return kvm_emulate_halt(vcpu); } static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1770,7 +1946,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, }; static const int kvm_vmx_max_exit_handlers = - sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); + ARRAY_SIZE(kvm_vmx_exit_handlers); /* * The guest has exited. See if we can fix it or if we need userspace @@ -1810,61 +1986,44 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ +} + static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; - u16 fs_sel, gs_sel, ldt_sel; - int fs_gs_ldt_reload_needed; int r; -again: - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. - */ - fs_sel = read_fs(); - gs_sel = read_gs(); - ldt_sel = read_ldt(); - fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; - if (!fs_gs_ldt_reload_needed) { - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmcs_write16(HOST_GS_SELECTOR, 0); - } - -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); -#endif +preempted: + if (vcpu->guest_debug.enabled) + kvm_guest_debug_pre(vcpu); +again: if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); - if (vcpu->guest_debug.enabled) - kvm_guest_debug_pre(vcpu); - + vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + /* * Loading guest fpu may have cleared host cr0.ts */ vmcs_writel(HOST_CR0, read_cr0()); -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - } -#endif + local_irq_disable(); + + vcpu->guest_mode = 1; + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + vmx_flush_tlb(vcpu); asm ( /* Store host registers */ - "pushf \n\t" #ifdef CONFIG_X86_64 "push %%rax; push %%rbx; push %%rdx;" "push %%rsi; push %%rdi; push %%rbp;" @@ -1909,12 +2068,11 @@ again: "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ #endif /* Enter guest mode */ - "jne launched \n\t" + "jne .Llaunched \n\t" ASM_VMX_VMLAUNCH "\n\t" - "jmp kvm_vmx_return \n\t" - "launched: " ASM_VMX_VMRESUME "\n\t" - ".globl kvm_vmx_return \n\t" - "kvm_vmx_return: " + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 "xchg %3, (%%rsp) \n\t" @@ -1957,7 +2115,6 @@ again: "pop %%ecx; popa \n\t" #endif "setbe %0 \n\t" - "popf \n\t" : "=q" (fail) : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), "c"(vcpu), @@ -1981,84 +2138,61 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); - /* - * Reload segment selectors ASAP. (it's needed for a functional - * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 - * relies on having 0 in %gs for the CPU PDA to work.) - */ - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); + vcpu->guest_mode = 0; + local_irq_enable(); - reload_tss(); - } ++vcpu->stat.exits; -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif - vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - if (fail) { + if (unlikely(fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; - } else { - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - - vcpu->launched = 1; - r = kvm_handle_exit(kvm_run, vcpu); - if (r > 0) { - /* Give scheduler a change to reschedule. */ - if (signal_pending(current)) { - ++vcpu->stat.signal_exits; - post_kvm_run_save(vcpu, kvm_run); - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } - - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++vcpu->stat.request_irq_exits; - post_kvm_run_save(vcpu, kvm_run); - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } - - kvm_resched(vcpu); + goto out; + } + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) + profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); + + vcpu->launched = 1; + r = kvm_handle_exit(kvm_run, vcpu); + if (r > 0) { + /* Give scheduler a change to reschedule. */ + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; + } + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) { + ++vcpu->stat.light_exits; goto again; } } +out: + if (r > 0) { + kvm_resched(vcpu); + goto preempted; + } + post_kvm_run_save(vcpu, kvm_run); return r; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ - vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); -} - static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 err_code) @@ -2122,7 +2256,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; - vcpu->fpu_active = 1; return 0; @@ -2188,11 +2321,50 @@ static struct kvm_arch_ops vmx_arch_ops = { static int __init vmx_init(void) { - return kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + void *iova; + int r; + + vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_a) + return -ENOMEM; + + vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_b) { + r = -ENOMEM; + goto out; + } + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + iova = kmap(vmx_io_bitmap_a); + memset(iova, 0xff, PAGE_SIZE); + clear_bit(0x80, iova); + kunmap(vmx_io_bitmap_a); + + iova = kmap(vmx_io_bitmap_b); + memset(iova, 0xff, PAGE_SIZE); + kunmap(vmx_io_bitmap_b); + + r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + if (r) + goto out1; + + return 0; + +out1: + __free_page(vmx_io_bitmap_b); +out: + __free_page(vmx_io_bitmap_a); + return r; } static void __exit vmx_exit(void) { + __free_page(vmx_io_bitmap_b); + __free_page(vmx_io_bitmap_a); + kvm_exit_arch(); } diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 7ade090..f60012d 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -98,8 +98,11 @@ static u8 opcode_table[256] = { 0, 0, 0, 0, /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 - 0x57 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x58 - 0x5F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x60 - 0x6F */ 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -128,9 +131,9 @@ static u8 opcode_table[256] = { /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, - 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, - DstMem | SrcImm | ModRM | Mov, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xD7 */ @@ -143,7 +146,8 @@ static u8 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + ImplicitOps, 0, + ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, /* 0xF8 - 0xFF */ 0, 0, 0, 0, 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM @@ -152,7 +156,7 @@ static u8 opcode_table[256] = { static u16 twobyte_table[256] = { /* 0x00 - 0x0F */ 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, - 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ @@ -481,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int mode = ctxt->mode; unsigned long modrm_ea; int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int no_wb = 0; /* Shadow copy of register state. Committed on successful emulation. */ unsigned long _regs[NR_VCPU_REGS]; @@ -1047,7 +1052,7 @@ done_prefixes: _regs[VCPU_REGS_RSP]), &dst.val, dst.bytes, ctxt)) != 0) goto done; - dst.val = dst.orig_val; /* skanky: disable writeback */ + no_wb = 1; break; default: goto cannot_emulate; @@ -1056,7 +1061,7 @@ done_prefixes: } writeback: - if ((d & Mov) || (dst.orig_val != dst.val)) { + if (!no_wb) { switch (dst.type) { case OP_REG: /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ @@ -1149,6 +1154,23 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; + case 0xf4: /* hlt */ + ctxt->vcpu->halt_request = 1; + goto done; + case 0xc3: /* ret */ + dst.ptr = &_eip; + goto pop_instruction; + case 0x58 ... 0x5f: /* pop reg */ + dst.ptr = (unsigned long *)&_regs[b & 0x7]; + +pop_instruction: + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0) + goto done; + + register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); + no_wb = 1; /* Disable writeback. */ + break; } goto writeback; @@ -1302,8 +1324,10 @@ twobyte_insn: twobyte_special_insn: /* Disable writeback. */ - dst.orig_val = dst.val; + no_wb = 1; switch (b) { + case 0x09: /* wbinvd */ + break; case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ break; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index a260198..b4a7588 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -139,6 +139,7 @@ err_put_filp: put_filp(file); return error; } +EXPORT_SYMBOL_GPL(anon_inode_getfd); /* * A single inode exists for all anon_inode files. Contrary to pipes, diff --git a/include/linux/magic.h b/include/linux/magic.h index 9d713c0..36cc20d 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -13,7 +13,6 @@ #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660 #define JFFS2_SUPER_MAGIC 0x72b6 -#define KVMFS_SUPER_MAGIC 0x19700426 #define ANON_INODE_FS_MAGIC 0x09041934 #define MINIX_SUPER_MAGIC 0x137F /* original minix fs */ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 9431101..576f2bb 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ #define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ +#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task, + * not handling interrupts, soon dead */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend * operation in progress @@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) +#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 96ac21f..8039dac 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -7,6 +7,7 @@ */ #include <linux/errno.h> +#include <asm/system.h> extern void cpu_idle(void); @@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { } static inline int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int retry, int wait) { - return -EBUSY; + WARN_ON(cpuid != 0); + local_irq_disable(); + func(info); + local_irq_enable(); + return 0; } #endif /* !SMP */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 208cf34..181ae70 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu) write_unlock_irq(&tasklist_lock); } +struct take_cpu_down_param { + unsigned long mod; + void *hcpu; +}; + /* Take this CPU down. */ -static int take_cpu_down(void *unused) +static int take_cpu_down(void *_param) { + struct take_cpu_down_param *param = _param; int err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) @@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct take_cpu_down_param tcd_param = { + .mod = mod, + .hcpu = hcpu, + }; if (num_online_cpus() == 1) return -EBUSY; @@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed(current, tmp); mutex_lock(&cpu_bitmask_lock); - p = __stop_machine_run(take_cpu_down, NULL, cpu); + p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); mutex_unlock(&cpu_bitmask_lock); if (IS_ERR(p) || cpu_online(cpu)) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 824b1c0..b4796d8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void) static int cpuset_handle_cpuhp(struct notifier_block *nb, unsigned long phase, void *cpu) { + if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) + return NOTIFY_DONE; + common_cpu_mem_hotplug_unplug(); return 0; } |