From ca3f10172eea9b95bbb66487656f3c3e93855702 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:49 +0200 Subject: KVM paravirt: Move kvm_smp_prepare_boot_cpu() from kvmclock.c to kvm.c. Async PF also needs to hook into smp_prepare_boot_cpu so move the hook into generic code. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 11 +++++++++++ arch/x86/kernel/kvmclock.c | 13 +------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 63b0ec8..e6db179 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -231,10 +231,21 @@ static void __init paravirt_ops_setup(void) #endif } +#ifdef CONFIG_SMP +static void __init kvm_smp_prepare_boot_cpu(void) +{ + WARN_ON(kvm_register_clock("primary cpu clock")); + native_smp_prepare_boot_cpu(); +} +#endif + void __init kvm_guest_init(void) { if (!kvm_para_available()) return; paravirt_ops_setup(); +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; +#endif } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca43ce3..f98d3ea 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -125,7 +125,7 @@ static struct clocksource kvm_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(char *txt) +int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; @@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) } #endif -#ifdef CONFIG_SMP -static void __init kvm_smp_prepare_boot_cpu(void) -{ - WARN_ON(kvm_register_clock("primary cpu clock")); - native_smp_prepare_boot_cpu(); -} -#endif - /* * After the clock is registered, the host will keep writing to the * registered memory location. 
If the guest happens to shutdown, this memory @@ -206,9 +198,6 @@ void __init kvmclock_init(void) x86_cpuinit.setup_percpu_clockev = kvm_setup_secondary_clock; #endif -#ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; -#endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; -- cgit v1.1 From fd10cde9294f73eeccbc16f3fec1ae6cde7b800c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:51 +0200 Subject: KVM paravirt: Add async PF initialization to PV guest. Enable async PF in a guest if async PF capability is discovered. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e6db179..032d03b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,16 +27,30 @@ #include #include #include +#include +#include #include +#include #define MMU_QUEUE_SIZE 1024 +static int kvmapf = 1; + +static int parse_no_kvmapf(char *arg) +{ + kvmapf = 0; + return 0; +} + +early_param("no-kvmapf", parse_no_kvmapf); + struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static struct kvm_para_state *kvm_para_state(void) { @@ -231,12 +245,86 @@ static void __init paravirt_ops_setup(void) #endif } +void __cpuinit kvm_guest_cpu_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { + u64 pa = __pa(&__get_cpu_var(apf_reason)); + + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + __get_cpu_var(apf_reason).enabled = 1; + printk(KERN_INFO"KVM setup async PF for cpu %d\n", + smp_processor_id()); + } +} + +static void 
kvm_pv_disable_apf(void *unused) +{ + if (!__get_cpu_var(apf_reason).enabled) + return; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + __get_cpu_var(apf_reason).enabled = 0; + + printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", + smp_processor_id()); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_disable_apf, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { WARN_ON(kvm_register_clock("primary cpu clock")); + kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); } + +static void kvm_guest_cpu_online(void *dummy) +{ + kvm_guest_cpu_init(); +} + +static void kvm_guest_cpu_offline(void *dummy) +{ + kvm_pv_disable_apf(NULL); +} + +static int __cpuinit kvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata kvm_cpu_notifier = { + .notifier_call = kvm_cpu_notify, +}; #endif void __init kvm_guest_init(void) @@ -245,7 +333,11 @@ void __init kvm_guest_init(void) return; paravirt_ops_setup(); + register_reboot_notifier(&kvm_pv_reboot_nb); #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + register_cpu_notifier(&kvm_cpu_notifier); +#else + kvm_guest_cpu_init(); #endif } -- cgit v1.1 From 631bc4878220932fe67fc46fc7cf7cccdb1ec597 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:52 +0200 Subject: KVM: Handle async PF in a guest. 
When the async PF capability is detected, hook up a special page fault handler that handles async page fault events and passes all other page faults through to the regular page fault handler. Also add async PF handling to nested SVM emulation. An async PF always generates an exit to L1, where the vcpu thread will be scheduled out until the page is available. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/entry_32.S | 10 +++ arch/x86/kernel/entry_64.S | 3 + arch/x86/kernel/kvm.c | 181 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 591e601..c8b4efa 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1406,6 +1406,16 @@ ENTRY(general_protection) CFI_ENDPROC END(general_protection) +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + RING0_EC_FRAME + pushl $do_async_page_fault + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(apf_page_fault) +#endif + /* * End of kprobes section */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e3ba417..bb3f6e9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1319,6 +1319,9 @@ errorentry xen_stack_segment do_stack_segment #endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault +#ifdef CONFIG_KVM_GUEST +errorentry async_page_fault do_async_page_fault +#endif #ifdef CONFIG_X86_MCE paranoidzeroentry machine_check *machine_check_vector(%rip) #endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 032d03b..d564063 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -29,8 +29,14 @@ #include #include #include +#include +#include +#include +#include #include #include +#include +#include #define MMU_QUEUE_SIZE 1024 @@ -64,6 +70,168 @@ static void kvm_io_delay(void) { } +#define 
KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) + +struct kvm_task_sleep_node { + struct hlist_node link; + wait_queue_head_t wq; + u32 token; + int cpu; +}; + +static struct kvm_task_sleep_head { + spinlock_t lock; + struct hlist_head list; +} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; + +static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, + u32 token) +{ + struct hlist_node *p; + + hlist_for_each(p, &b->list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->token == token) + return n; + } + + return NULL; +} + +void kvm_async_pf_task_wait(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node n, *e; + DEFINE_WAIT(wait); + + spin_lock(&b->lock); + e = _find_apf_task(b, token); + if (e) { + /* dummy entry exist -> wake up was delivered ahead of PF */ + hlist_del(&e->link); + kfree(e); + spin_unlock(&b->lock); + return; + } + + n.token = token; + n.cpu = smp_processor_id(); + init_waitqueue_head(&n.wq); + hlist_add_head(&n.link, &b->list); + spin_unlock(&b->lock); + + for (;;) { + prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + local_irq_enable(); + schedule(); + local_irq_disable(); + } + finish_wait(&n.wq, &wait); + + return; +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); + +static void apf_task_wake_one(struct kvm_task_sleep_node *n) +{ + hlist_del_init(&n->link); + if (waitqueue_active(&n->wq)) + wake_up(&n->wq); +} + +static void apf_task_wake_all(void) +{ + int i; + + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { + struct hlist_node *p, *next; + struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + spin_lock(&b->lock); + hlist_for_each_safe(p, next, &b->list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->cpu == smp_processor_id()) + apf_task_wake_one(n); + } + spin_unlock(&b->lock); + } +} + +void kvm_async_pf_task_wake(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *n; + + if (token == ~0) { + apf_task_wake_all(); + return; + } + +again: + spin_lock(&b->lock); + n = _find_apf_task(b, token); + if (!n) { + /* + * async PF was not yet handled. + * Add dummy entry for the token. 
+ */ + n = kmalloc(sizeof(*n), GFP_ATOMIC); + if (!n) { + /* + * Allocation failed! Busy wait while other cpu + * handles async PF. + */ + spin_unlock(&b->lock); + cpu_relax(); + goto again; + } + n->token = token; + n->cpu = smp_processor_id(); + init_waitqueue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + } else + apf_task_wake_one(n); + spin_unlock(&b->lock); + return; +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); + +u32 kvm_read_and_reset_pf_reason(void) +{ + u32 reason = 0; + + if (__get_cpu_var(apf_reason).enabled) { + reason = __get_cpu_var(apf_reason).reason; + __get_cpu_var(apf_reason).reason = 0; + } + + return reason; +} +EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); + +dotraplinkage void __kprobes +do_async_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + switch (kvm_read_and_reset_pf_reason()) { + default: + do_page_fault(regs, error_code); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + /* page is swapped out by the host. */ + kvm_async_pf_task_wait((u32)read_cr2()); + break; + case KVM_PV_REASON_PAGE_READY: + kvm_async_pf_task_wake((u32)read_cr2()); + break; + } +} + static void kvm_mmu_op(void *buffer, unsigned len) { int r; @@ -300,6 +468,7 @@ static void kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { kvm_pv_disable_apf(NULL); + apf_task_wake_all(); } static int __cpuinit kvm_cpu_notify(struct notifier_block *self, @@ -327,13 +496,25 @@ static struct notifier_block __cpuinitdata kvm_cpu_notifier = { }; #endif +static void __init kvm_apf_trap_init(void) +{ + set_intr_gate(14, &async_page_fault); +} + void __init kvm_guest_init(void) { + int i; + if (!kvm_para_available()) return; paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) + spin_lock_init(&async_pf_sleepers[i].lock); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) + x86_init.irqs.trap_init = kvm_apf_trap_init; + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = 
kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); -- cgit v1.1 From 6c047cd982f944fa63b2d96de2a06463d113f9fa Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:54 +0200 Subject: KVM paravirt: Handle async PF in non preemptable context If async page fault is received by idle task or when preempt_count is not zero, the guest cannot reschedule, so do sti; hlt and wait for page to be ready. vcpu can still process interrupts while it waits for the page to be ready. Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d564063..47ea93e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -37,6 +37,7 @@ #include #include #include +#include #define MMU_QUEUE_SIZE 1024 @@ -78,6 +79,8 @@ struct kvm_task_sleep_node { wait_queue_head_t wq; u32 token; int cpu; + bool halted; + struct mm_struct *mm; }; static struct kvm_task_sleep_head { @@ -106,6 +109,11 @@ void kvm_async_pf_task_wait(u32 token) struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; DEFINE_WAIT(wait); + int cpu, idle; + + cpu = get_cpu(); + idle = idle_cpu(cpu); + put_cpu(); spin_lock(&b->lock); e = _find_apf_task(b, token); @@ -119,19 +127,33 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); + n.mm = current->active_mm; + n.halted = idle || preempt_count() > 1; + atomic_inc(&n.mm->mm_count); init_waitqueue_head(&n.wq); hlist_add_head(&n.link, &b->list); spin_unlock(&b->lock); for (;;) { - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (!n.halted) + prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; - local_irq_enable(); - schedule(); - local_irq_disable(); + + if (!n.halted) { + local_irq_enable(); + 
schedule(); + local_irq_disable(); + } else { + /* + * We cannot reschedule. So halt. + */ + native_safe_halt(); + local_irq_disable(); + } } - finish_wait(&n.wq, &wait); + if (!n.halted) + finish_wait(&n.wq, &wait); return; } @@ -140,7 +162,12 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (waitqueue_active(&n->wq)) + if (!n->mm) + return; + mmdrop(n->mm); + if (n->halted) + smp_send_reschedule(n->cpu); + else if (waitqueue_active(&n->wq)) wake_up(&n->wq); } @@ -193,6 +220,7 @@ again: } n->token = token; n->cpu = smp_processor_id(); + n->mm = NULL; init_waitqueue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else -- cgit v1.1 From 6adba527420651b6cacaf392541c09fb108711a2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 14 Oct 2010 11:22:55 +0200 Subject: KVM: Let host know whether the guest can handle async PF in non-userspace context. If guest can detect that it runs in non-preemptable context it can handle async PFs at any time, so let host know that it can send async PF even if guest cpu is not in userspace. 
Acked-by: Rik van Riel Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 47ea93e..91b3d65 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -449,6 +449,9 @@ void __cpuinit kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { u64 pa = __pa(&__get_cpu_var(apf_reason)); +#ifdef CONFIG_PREEMPT + pa |= KVM_ASYNC_PF_SEND_ALWAYS; +#endif wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); __get_cpu_var(apf_reason).enabled = 1; printk(KERN_INFO"KVM setup async PF for cpu %d\n", -- cgit v1.1 From a63512a4d711c9bd6a5d03847f45fcf88cdea0c6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 16 Dec 2010 11:27:23 +0200 Subject: KVM guest: Fix kvm clock initialization when it's configured out Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 91b3d65..8dc4466 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -486,7 +486,9 @@ static struct notifier_block kvm_pv_reboot_nb = { #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { +#ifdef CONFIG_KVM_CLOCK WARN_ON(kvm_register_clock("primary cpu clock")); +#endif kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); } -- cgit v1.1 From e5c301428294cb8925667c9ee39f817c4ab1c2c9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 11 Jan 2011 12:15:54 +0200 Subject: KVM: Initialize fpu state in preemptible context init_fpu() (which is indirectly called by the fpu switching code) assumes it is in process context. Rather than making init_fpu() use an atomic allocation, which can cause a task to be killed, make sure the fpu is already initialized when we enter the run loop. KVM-Stable-Tag. Reported-and-tested-by: Kirill A. 
Shutemov Acked-by: Pekka Enberg Reviewed-by: Christoph Lameter Signed-off-by: Avi Kivity --- arch/x86/kernel/i387.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 58bb239..e60c38c 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk) set_stopped_child_used_math(tsk); return 0; } +EXPORT_SYMBOL_GPL(init_fpu); /* * The xstateregs_active() routine is the same as the fpregs_active() routine, -- cgit v1.1