Diffstat (limited to 'virt/kvm')
-rw-r--r--   virt/kvm/eventfd.c  |   9
-rw-r--r--   virt/kvm/kvm_main.c | 197
2 files changed, 131 insertions, 75 deletions
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2ca4535..73358d2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -90,7 +90,7 @@ irqfd_shutdown(struct work_struct *work)
 	 * We know no new events will be scheduled at this point, so block
 	 * until all previously outstanding events have completed
 	 */
-	flush_work(&irqfd->inject);
+	flush_work_sync(&irqfd->inject);

 	/*
 	 * It is now safe to release the object's resources
@@ -313,8 +313,9 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
 		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
 			/*
 			 * This rcu_assign_pointer is needed for when
-			 * another thread calls kvm_irqfd_update before
-			 * we flush workqueue below.
+			 * another thread calls kvm_irq_routing_update before
+			 * we flush workqueue below (we synchronize with
+			 * kvm_irq_routing_update using irqfds.lock).
 			 * It is paired with synchronize_rcu done by caller
 			 * of that function.
 			 */
@@ -577,7 +578,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)

 	mutex_lock(&kvm->slots_lock);

-	/* Verify that there isnt a match already */
+	/* Verify that there isn't a match already */
 	if (ioeventfd_check_collision(kvm, p)) {
 		ret = -EEXIST;
 		goto unlock_fail;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f29abeb..6330653 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -30,7 +30,7 @@
 #include <linux/debugfs.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
 #include <linux/cpumask.h>
@@ -52,7 +52,6 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <asm-generic/bitops/le.h>

 #include "coalesced_mmio.h"
 #include "async_pf.h"
@@ -69,7 +68,7 @@ MODULE_LICENSE("GPL");
  *		kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */

-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_RAW_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);

 static cpumask_var_t cpus_hardware_enabled;
@@ -137,6 +136,14 @@ void vcpu_load(struct kvm_vcpu *vcpu)
 	int cpu;

 	mutex_lock(&vcpu->mutex);
+	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
+		/* The thread running this VCPU changed. */
+		struct pid *oldpid = vcpu->pid;
+		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+		rcu_assign_pointer(vcpu->pid, newpid);
+		synchronize_rcu();
+		put_pid(oldpid);
+	}
 	cpu = get_cpu();
 	preempt_notifier_register(&vcpu->preempt_notifier);
 	kvm_arch_vcpu_load(vcpu, cpu);
@@ -165,13 +172,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)

 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

-	raw_spin_lock(&kvm->requests_lock);
-	me = smp_processor_id();
+	me = get_cpu();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (kvm_make_check_request(req, vcpu))
-			continue;
+		kvm_make_request(req, vcpu);
 		cpu = vcpu->cpu;
-		if (cpus != NULL && cpu != -1 && cpu != me)
+
+		/* Set ->requests bit before we read ->mode */
+		smp_mb();
+
+		if (cpus != NULL && cpu != -1 && cpu != me &&
+		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
 			cpumask_set_cpu(cpu, cpus);
 	}
 	if (unlikely(cpus == NULL))
@@ -180,7 +190,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		smp_call_function_many(cpus, ack_flush, NULL, 1);
 	else
 		called = false;
-	raw_spin_unlock(&kvm->requests_lock);
+	put_cpu();
 	free_cpumask_var(cpus);
 	return called;
 }
@@ -209,6 +219,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->cpu = -1;
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
+	vcpu->pid = NULL;
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);

@@ -233,6 +244,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);

 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
+	put_pid(vcpu->pid);
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
 }
@@ -463,15 +475,14 @@ static struct kvm *kvm_create_vm(void)
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
 	spin_lock_init(&kvm->mmu_lock);
-	raw_spin_lock_init(&kvm->requests_lock);
 	kvm_eventfd_init(kvm);
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
 	atomic_set(&kvm->users_count, 1);
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);

 	return kvm;

@@ -544,9 +555,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;

 	kvm_arch_sync_events(kvm);
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kvm_io_bus_destroy(kvm->buses[i]);
@@ -588,6 +599,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }

+#ifndef CONFIG_S390
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
  * This makes it possible to do double buffering: see x86's
@@ -608,6 +620,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 	return 0;
 }
+#endif /* !CONFIG_S390 */

 /*
  * Allocate some memory and give it an address in the guest physical address
@@ -621,7 +634,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
 			    int user_alloc)
 {
-	int r, flush_shadow = 0;
+	int r;
 	gfn_t base_gfn;
 	unsigned long npages;
 	unsigned long i;
@@ -741,8 +754,6 @@ skip_lpage:
 		if (kvm_create_dirty_bitmap(&new) < 0)
 			goto out_free;
 		/* destroy any largepage mappings for dirty tracking */
-		if (old.npages)
-			flush_shadow = 1;
 	}
 #else  /* not defined CONFIG_S390 */
 	new.user_alloc = user_alloc;
@@ -813,9 +824,6 @@ skip_lpage:
 	kvm_free_physmem_slot(&old, &new);
 	kfree(old_memslots);

-	if (flush_shadow)
-		kvm_arch_flush_shadow(kvm);
-
 	return 0;

 out_free:
@@ -1029,6 +1037,26 @@ static pfn_t get_fault_pfn(void)
 	return fault_pfn;
 }

+int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
+	unsigned long start, int write, struct page **page)
+{
+	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
+
+	if (write)
+		flags |= FOLL_WRITE;
+
+	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
+}
+
+static inline int check_user_page_hwpoison(unsigned long addr)
+{
+	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
+
+	rc = __get_user_pages(current, current->mm, addr, 1,
+			      flags, NULL, NULL, NULL);
+	return rc == -EHWPOISON;
+}
+
 static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 			bool *async, bool write_fault, bool *writable)
 {
@@ -1053,7 +1081,14 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 	if (writable)
 		*writable = write_fault;

-	npages = get_user_pages_fast(addr, 1, write_fault, page);
+	if (async) {
+		down_read(&current->mm->mmap_sem);
+		npages = get_user_page_nowait(current, current->mm,
+						addr, write_fault, page);
+		up_read(&current->mm->mmap_sem);
+	} else
+		npages = get_user_pages_fast(addr, 1, write_fault,
+						page);

 	/* map read fault as writable if possible */
 	if (unlikely(!write_fault) && npages == 1) {
@@ -1076,7 +1111,8 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 			return get_fault_pfn();

 		down_read(&current->mm->mmap_sem);
-		if (is_hwpoison_address(addr)) {
+		if (npages == -EHWPOISON ||
+			(!async && check_user_page_hwpoison(addr))) {
 			up_read(&current->mm->mmap_sem);
 			get_page(hwpoison_page);
 			return page_to_pfn(hwpoison_page);
@@ -1421,7 +1457,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;

-		generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
+		__set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }

@@ -1466,18 +1502,55 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);

-void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
-	ktime_t expires;
-	DEFINE_WAIT(wait);
-
-	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
-
-	/* Sleep for 100 us, and hope lock-holder got scheduled */
-	expires = ktime_add_ns(ktime_get(), 100000UL);
-	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+	struct kvm *kvm = me->kvm;
+	struct kvm_vcpu *vcpu;
+	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+	int yielded = 0;
+	int pass;
+	int i;

-	finish_wait(&vcpu->wq, &wait);
+	/*
+	 * We boost the priority of a VCPU that is runnable but not
+	 * currently running, because it got preempted by something
+	 * else and called schedule in __vcpu_run.  Hopefully that
+	 * VCPU is holding the lock that we need and will release it.
+	 * We approximate round-robin by starting at the last boosted VCPU.
+	 */
+	for (pass = 0; pass < 2 && !yielded; pass++) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			struct task_struct *task = NULL;
+			struct pid *pid;
+			if (!pass && i < last_boosted_vcpu) {
+				i = last_boosted_vcpu;
+				continue;
+			} else if (pass && i > last_boosted_vcpu)
+				break;
+			if (vcpu == me)
+				continue;
+			if (waitqueue_active(&vcpu->wq))
+				continue;
+			rcu_read_lock();
+			pid = rcu_dereference(vcpu->pid);
+			if (pid)
+				task = get_pid_task(vcpu->pid, PIDTYPE_PID);
+			rcu_read_unlock();
+			if (!task)
+				continue;
+			if (task->flags & PF_VCPU) {
+				put_task_struct(task);
+				continue;
+			}
+			if (yield_to(task, 1)) {
+				put_task_struct(task);
+				kvm->last_boosted_vcpu = i;
+				yielded = 1;
+				break;
+			}
+			put_task_struct(task);
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

@@ -2122,9 +2195,9 @@ static void hardware_enable_nolock(void *junk)

 static void hardware_enable(void *junk)
 {
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	hardware_enable_nolock(junk);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 }

 static void hardware_disable_nolock(void *junk)
@@ -2139,9 +2212,9 @@ static void hardware_disable_nolock(void *junk)

 static void hardware_disable(void *junk)
 {
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	hardware_disable_nolock(junk);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 }

 static void hardware_disable_all_nolock(void)
@@ -2155,16 +2228,16 @@ static void hardware_disable_all_nolock(void)

 static void hardware_disable_all(void)
 {
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	hardware_disable_all_nolock();
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 }

 static int hardware_enable_all(void)
 {
 	int r = 0;

-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);

 	kvm_usage_count++;
 	if (kvm_usage_count == 1) {
@@ -2177,7 +2250,7 @@ static int hardware_enable_all(void)
 		}
 	}

-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);

 	return r;
 }
@@ -2339,10 +2412,10 @@ static int vm_stat_get(void *_offset, u64 *val)
 	struct kvm *kvm;

 	*val = 0;
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		*val += *(u32 *)((void *)kvm + offset);
-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 	return 0;
 }

@@ -2356,12 +2429,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	int i;

 	*val = 0;
-	spin_lock(&kvm_lock);
+	raw_spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			*val += *(u32 *)((void *)vcpu + offset);

-	spin_unlock(&kvm_lock);
+	raw_spin_unlock(&kvm_lock);
 	return 0;
 }

@@ -2392,33 +2465,26 @@ static void kvm_exit_debug(void)
 	debugfs_remove(kvm_debugfs_dir);
 }

-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+static int kvm_suspend(void)
 {
 	if (kvm_usage_count)
 		hardware_disable_nolock(NULL);
 	return 0;
 }

-static int kvm_resume(struct sys_device *dev)
+static void kvm_resume(void)
 {
 	if (kvm_usage_count) {
-		WARN_ON(spin_is_locked(&kvm_lock));
+		WARN_ON(raw_spin_is_locked(&kvm_lock));
 		hardware_enable_nolock(NULL);
 	}
-	return 0;
 }

-static struct sysdev_class kvm_sysdev_class = {
-	.name = "kvm",
+static struct syscore_ops kvm_syscore_ops = {
 	.suspend = kvm_suspend,
 	.resume = kvm_resume,
 };

-static struct sys_device kvm_sysdev = {
-	.id = 0,
-	.cls = &kvm_sysdev_class,
-};
-
 struct page *bad_page;
 pfn_t bad_pfn;

@@ -2502,14 +2568,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_free_2;
 	register_reboot_notifier(&kvm_reboot_notifier);

-	r = sysdev_class_register(&kvm_sysdev_class);
-	if (r)
-		goto out_free_3;
-
-	r = sysdev_register(&kvm_sysdev);
-	if (r)
-		goto out_free_4;
-
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
 		vcpu_align = __alignof__(struct kvm_vcpu);
@@ -2517,7 +2575,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 					   0, NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
-		goto out_free_5;
+		goto out_free_3;
 	}

 	r = kvm_async_pf_init();
@@ -2534,6 +2592,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		goto out_unreg;
 	}

+	register_syscore_ops(&kvm_syscore_ops);
+
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;

@@ -2545,10 +2605,6 @@ out_unreg:
 	kvm_async_pf_deinit();
 out_free:
 	kmem_cache_destroy(kvm_vcpu_cache);
-out_free_5:
-	sysdev_unregister(&kvm_sysdev);
-out_free_4:
-	sysdev_class_unregister(&kvm_sysdev_class);
 out_free_3:
 	unregister_reboot_notifier(&kvm_reboot_notifier);
 	unregister_cpu_notifier(&kvm_cpu_notifier);
@@ -2576,8 +2632,7 @@ void kvm_exit(void)
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
 	kvm_async_pf_deinit();
-	sysdev_unregister(&kvm_sysdev);
-	sysdev_class_unregister(&kvm_sysdev_class);
+	unregister_syscore_ops(&kvm_syscore_ops);
 	unregister_reboot_notifier(&kvm_reboot_notifier);
 	unregister_cpu_notifier(&kvm_cpu_notifier);
 	on_each_cpu(hardware_disable_nolock, NULL, 1);