From 3843fc2575e3389f4f0ad0420a720240a5746a5d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 9 May 2008 12:05:57 +0100 Subject: xen: remove support for non-PAE 32-bit Non-PAE operation has been deprecated in Xen for a while, and is rarely tested or used. xen-unstable has now officially dropped non-PAE support. Since Xen/pvops' non-PAE support has also been broken for a while, we may as well completely drop it altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 51 +++++++++++++++++------------------------------- arch/x86/xen/mmu.c | 19 ++---------------- arch/x86/xen/mmu.h | 24 ++++++----------------- arch/x86/xen/xen-head.S | 4 ---- 5 files changed, 27 insertions(+), 73 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be..525b108 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e4..a05b772 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowmem_page_readonly(pmd); - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); } /* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, .set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, -#endif /* PAE */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 126766d..07c2653 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -222,7 +222,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; return ret; } -#ifdef CONFIG_X86_PAE + void xen_set_pud(pud_t *ptr, pud_t val) { struct multicall_space mcs; @@ -272,12 +272,6 @@ pmd_t xen_make_pmd(pmdval_t pmd) return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} -#endif /* CONFIG_X86_PAE */ /* (Yet another) pagetable walker. This one is intended for pinning a @@ -430,8 +424,6 @@ static int pin_page(struct page *page, enum pt_level level) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - unsigned level; - xen_mc_batch(); if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -441,14 +433,7 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - xen_do_pin(level, PFN_DOWN(__pa(pgd))); - + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b..5fe961c 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587..2ab5f42 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -30,11 +30,7 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") #endif /*CONFIG_XEN */ -- cgit v1.1 From 334ef7a7ab8f80b689a2be95d5e62d2167900865 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner commit 2d474871e2fb092eb46a0930aba5442e10eb96cc Author: Mike Travis Date: Mon May 12 21:21:13 2008 +0200 --- arch/x86/xen/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 94e6900..7a70638 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -345,7 +345,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) cpus_and(mask, mask, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) xen_send_IPI_one(cpu, vector); } @@ -413,7 +413,7 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), /* Make sure other vcpus get a chance to run if they need to. */ yield = false; - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) if (xen_vcpu_stolen(cpu)) yield = true; -- cgit v1.1 From 7b1333aa4cb546ddeb9c05098a53d9a777623a05 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:01 +0100 Subject: xen: use hypercall rather than clts Xen will trap and emulate clts, but its better to use a hypercall. Also, xenner doesn't handle clts. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a05b772..446f4cd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -607,6 +607,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, xen_mc_issue(PARAVIRT_LAZY_MMU); } +static void xen_clts(void) +{ + struct multicall_space mcs; + + mcs = xen_mc_entry(0); + + MULTI_fpu_taskswitch(mcs.mc, 0); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_write_cr0(unsigned long cr0) +{ + struct multicall_space mcs; + + /* Only pay attention to cr0.TS; everything else is + ignored. */ + mcs = xen_mc_entry(0); + + MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + static void xen_write_cr2(unsigned long cr2) { x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; @@ -978,10 +1002,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_debugreg = xen_set_debugreg, .get_debugreg = xen_get_debugreg, - .clts = native_clts, + .clts = xen_clts, .read_cr0 = native_read_cr0, - .write_cr0 = native_write_cr0, + .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, -- cgit v1.1 From 349c709f42453707f74bece0d9d35ee5b3842893 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:02 +0100 Subject: xen: use new sched_op Use the new sched_op hypercall, mainly because xenner doesn't support the old one. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 446f4cd..35ddaf5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -254,7 +254,7 @@ static void xen_irq_enable(void) static void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) BUG(); } @@ -1138,11 +1138,13 @@ static const struct smp_ops xen_smp_ops __initdata = { static void xen_reboot(int reason) { + struct sched_shutdown r = { .reason = reason }; + #ifdef CONFIG_SMP smp_send_stop(); #endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } -- cgit v1.1 From 2956a3511c8c5dccb1d4739ead17c7c3c23a24b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:04 +0100 Subject: xen: allow some cr4 updates The guest can legitimately change things like cr4.OSFXSR and OSXMMEXCPT, so let it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 35ddaf5..f687648 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -648,8 +648,10 @@ static unsigned long xen_read_cr2_direct(void) static void xen_write_cr4(unsigned long cr4) { - /* Just ignore cr4 changes; Xen doesn't allow us to do - anything anyway. */ + cr4 &= ~X86_CR4_PGE; + cr4 &= ~X86_CR4_PSE; + + native_write_cr4(cr4); } static unsigned long xen_read_cr3(void) -- cgit v1.1 From 239d1fc04ed0b58d638096b12a7f6d50269d30c9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:05 +0100 Subject: xen: don't worry about preempt during xen_irq_enable() When enabling interrupts, we don't need to worry about preemption, because we either enter with interrupts disabled - so no preemption - or the caller is confused and is re-enabling interrupts on some indeterminate processor. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f687648..4a372b7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -235,13 +235,13 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + vcpu = x86_read_percpu(xen_vcpu); vcpu->evtchn_upcall_mask = 0; - preempt_enable_no_resched(); /* Doesn't matter if we get preempted here, because any pending event will get dealt with anyway. */ -- cgit v1.1 From a15af1c9ea2750a9ff01e51615c45950bad8221b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:06 +0100 Subject: x86/paravirt: add pte_flags to just get pte flags Add pte_flags() to extract the flags from a pte. This is a special case of pte_val() which is only guaranteed to return the pte's flags correctly; the page number may be corrupted or missing. The intent is to allow paravirt implementations to return pte flags without having to do any translation of the page number (most notably, Xen). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4a372b7..1b4b5fa 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1101,6 +1101,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pmd = xen_set_pmd, .pte_val = xen_pte_val, + .pte_flags = native_pte_val, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.1 From 9e124fe16ff24746d6de5a2ad685266d7bce0e08 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 26 May 2008 23:31:07 +0100 Subject: xen: Enable console tty by default in domU if it's not a dummy Without console= arguments on the kernel command line, the first console to register becomes enabled and the preferred console (the one behind /dev/console). This is normally tty (assuming CONFIG_VT_CONSOLE is enabled, which it commonly is). This is okay as long tty is a useful console. But unless we have the PV framebuffer, and it is enabled for this domain, tty0 in domU is merely a dummy. In that case, we want the preferred console to be the Xen console hvc0, and we want it without having to fiddle with the kernel command line. Commit b8c2d3dfbc117dff26058fbac316b8acfc2cb5f7 did that for us. Since we now have the PV framebuffer, we want to enable and prefer tty again, but only when PVFB is enabled. But even then we still want to enable the Xen console as well. Problem: when tty registers, we can't yet know whether the PVFB is enabled. By the time we can know (xenstore is up), the console setup game is over. Solution: enable console tty by default, but keep hvc as the preferred console. Change the preferred console to tty when PVFB probes successfully, unless we've been given console kernel parameters. Signed-off-by: Markus Armbruster Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1b4b5fa..6cfb708 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1256,8 +1256,10 @@ asmlinkage void __init xen_start_kernel(void) ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; - if (!is_initial_xendomain()) + if (!is_initial_xendomain()) { + add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } /* Start the world */ start_kernel(); -- cgit v1.1 From 83abc70a4c6e306f4c1672e25884322f797e4fcb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:12 +0100 Subject: xen: make earlyprintk=xen work again For some perverse reason, if you call add_preferred_console() it prevents setup_early_printk() from successfully enabling the boot console - unless you make it a preferred console too... Also, make xenboot console output distinct from normal console output, since it gets repeated when the console handover happens, and the duplicated output is confusing without disambiguation. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner Cc: Markus Armbruster Cc: Gerd Hoffmann --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6cfb708..5c0635a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1257,6 +1257,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; if (!is_initial_xendomain()) { + add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); } -- cgit v1.1 From ec9b2065d4d3b797604c09a569083dd9ff951b1b Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 26 May 2008 23:31:13 +0100 Subject: xen: Move manage.c to drivers/xen for ia64/xen support move arch/x86/xen/manage.c under drivers/xen/to share codes with x86 and ia64. ia64/xen also uses manage.c Signed-off-by: Isaku Yamahata Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/manage.c | 143 -------------------------------------------------- 2 files changed, 1 insertion(+), 144 deletions(-) delete mode 100644 arch/x86/xen/manage.c (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3d8df98..40b119b 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o manage.o xen-asm.o grant-table.o + time.o xen-asm.o grant-table.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c deleted file mode 100644 index aa7af9e..0000000 --- a/arch/x86/xen/manage.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Handle extern requests for shutdown, reboot and sysrq - */ -#include -#include -#include -#include - -#include - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 - -/* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; - -static void shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - char *str; - struct xenbus_transaction xbt; - int err; - - if (shutting_down != SHUTDOWN_INVALID) - return; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - - str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); - /* Ignore read errors and empty reads. */ - if (XENBUS_IS_ERR_READ(str)) { - xenbus_transaction_end(xbt, 1); - return; - } - - xenbus_write(xbt, "control", "shutdown", ""); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) { - kfree(str); - goto again; - } - - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) - orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) - ctrl_alt_del(); - else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); - shutting_down = SHUTDOWN_INVALID; - } - - kfree(str); -} - -static void sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - char sysrq_key = '\0'; - struct xenbus_transaction xbt; - int err; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); - xenbus_transaction_end(xbt, 1); - return; - } - - if (sysrq_key != '\0') - xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) - goto again; - - if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); -} - -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - .callback = shutdown_handler -}; - -static struct xenbus_watch sysrq_watch = { - .node = "control/sysrq", - .callback = sysrq_handler -}; - -static int setup_shutdown_watcher(void) -{ - int err; - - err = register_xenbus_watch(&shutdown_watch); - if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); - return err; - } - - err = register_xenbus_watch(&sysrq_watch); - if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); - return err; - } - - return 0; -} - -static int shutdown_event(struct notifier_block *notifier, - unsigned long event, - void *data) -{ - setup_shutdown_watcher(); - return NOTIFY_DONE; -} - -static int __init setup_shutdown_event(void) -{ - static struct notifier_block xenstore_notifier = { - .notifier_call = shutdown_event - }; - register_xenstore_notifier(&xenstore_notifier); - - return 0; -} - -subsys_initcall(setup_shutdown_event); -- cgit v1.1 From 38bb5ab4179572f4d24d3ca7188172a31ca51a69 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:16 +0100 Subject: xen: count resched interrupts properly Make sure resched interrupts appear in /proc/interrupts in the proper place. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/smp.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 94e6900..74ab896 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -65,6 +65,12 @@ static struct call_data_struct *call_data; */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_resched_count++; +#else + add_pda(irq_resched_count, 1); +#endif + return IRQ_HANDLED; } -- cgit v1.1 From d451bb7aa852627bdf7be7937dc3d9d9f261b235 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:18 +0100 Subject: xen: make phys_to_machine structure dynamic We now support the use of memory hotplug, so the physical to machine page mapping structure must be dynamic. This is implemented as a two-level radix tree structure, which allows us to efficiently incrementally allocate memory for the p2m table as new pages are added. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/setup.c | 2 -- arch/x86/xen/xen-ops.h | 2 ++ 4 files changed, 88 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5c0635a..73d3c84 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1221,7 +1221,7 @@ asmlinkage void __init xen_start_kernel(void) /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) - phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + xen_build_dynamic_phys_to_machine(); pgd = (pgd_t *)xen_start_info->pt_base; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 07c2653..c3b27de 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,6 +56,91 @@ #include "multicalls.h" #include "mmu.h" +/* + * This should probably be a config option. On 32-bit, it costs 1 + * page/gig of memory; on 64-bit its 2 pages/gig. If we want it to be + * completely unbounded we can add another level to the p2m structure. + */ +#define MAX_GUEST_PAGES (16ull * 1024*1024*1024 / PAGE_SIZE) +#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) + +static unsigned long *p2m_top[MAX_GUEST_PAGES / P2M_ENTRIES_PER_PAGE]; + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_GUEST_PAGES); + return pfn / P2M_ENTRIES_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_ENTRIES_PER_PAGE; +} + +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned pfn; + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + + BUG_ON(xen_start_info->nr_pages >= MAX_GUEST_PAGES); + + for(pfn = 0; + pfn < xen_start_info->nr_pages; + pfn += P2M_ENTRIES_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + + p2m_top[topidx] = &mfn_list[pfn]; + } +} + +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, idx; + + topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == NULL) + return INVALID_P2M_ENTRY; + + idx = p2m_index(pfn); + return p2m_top[topidx][idx]; +} + +static void alloc_p2m(unsigned long **pp) +{ + unsigned long *p; + unsigned i; + + p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(p == NULL); + + for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + p[i] = INVALID_P2M_ENTRY; + + if (cmpxchg(pp, NULL, p) != NULL) + free_page((unsigned long)p); +} + +void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, idx; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + + topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == NULL) { + /* no need to allocate a page to store an invalid entry */ + if (mfn == INVALID_P2M_ENTRY) + return; + alloc_p2m(&p2m_top[topidx]); + } + + idx = p2m_index(pfn); + p2m_top[topidx][idx] = mfn; +} + xmaddr_t arbitrary_virt_to_machine(unsigned long address) { unsigned int level; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 82517e4..37f8f0b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,8 +27,6 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; -unsigned long *phys_to_machine_mapping; -EXPORT_SYMBOL(phys_to_machine_mapping); /** * machine_specific_memory_setup - Hook for machine specific memory setup. diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f1063ae..7bdc8c5 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -22,6 +22,8 @@ void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void __init xen_build_dynamic_phys_to_machine(void); + void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); unsigned long xen_cpu_khz(void); -- cgit v1.1 From 8006ec3e911f93d702e1d4a4e387e244ab434924 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:19 +0100 Subject: xen: add configurable max domain size Add a config option to set the max size of a Xen domain. This is used to scale the size of the physical-to-machine array; it ends up using around 1 page/GByte, so there's no reason to be very restrictive. For a 32-bit guest, the default value of 8GB is probably sufficient; there's not much point in giving a 32-bit machine much more memory than that. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/Kconfig | 10 ++++++++++ arch/x86/xen/mmu.c | 25 ++++++++++++------------- arch/x86/xen/setup.c | 3 +++ 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 525b108..d0f1c7c 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -11,3 +11,13 @@ config XEN This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the Xen hypervisor. + +config XEN_MAX_DOMAIN_MEMORY + int "Maximum allowed size of a domain in gigabytes" + default 8 + depends on XEN + help + The pseudo-physical to machine address array is sized + according to the maximum possible memory size of a Xen + domain. This array uses 1 page per gigabyte, so there's no + need to be too stingy here. \ No newline at end of file diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c3b27de..644232a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,19 +56,13 @@ #include "multicalls.h" #include "mmu.h" -/* - * This should probably be a config option. On 32-bit, it costs 1 - * page/gig of memory; on 64-bit its 2 pages/gig. If we want it to be - * completely unbounded we can add another level to the p2m structure. - */ -#define MAX_GUEST_PAGES (16ull * 1024*1024*1024 / PAGE_SIZE) #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -static unsigned long *p2m_top[MAX_GUEST_PAGES / P2M_ENTRIES_PER_PAGE]; +static unsigned long *p2m_top[MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE]; static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_GUEST_PAGES); + BUG_ON(pfn >= MAX_DOMAIN_PAGES); return pfn / P2M_ENTRIES_PER_PAGE; } @@ -81,12 +75,9 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned pfn; unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - BUG_ON(xen_start_info->nr_pages >= MAX_GUEST_PAGES); - - for(pfn = 0; - pfn < xen_start_info->nr_pages; - pfn += P2M_ENTRIES_PER_PAGE) { + for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@ -97,6 +88,9 @@ unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, idx; + if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + return INVALID_P2M_ENTRY; + topidx = p2m_top_index(pfn); if (p2m_top[topidx] == NULL) return INVALID_P2M_ENTRY; @@ -129,6 +123,11 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) return; } + if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return; + } + topidx = p2m_top_index(pfn); if (p2m_top[topidx] == NULL) { /* no need to allocate a page to store an invalid entry */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 37f8f0b..4884478 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,8 @@ char * __init xen_memory_setup(void) { unsigned long max_pfn = xen_start_info->nr_pages; + max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + e820.nr_map = 0; add_memory_region(0, LOWMEMSIZE(), E820_RAM); add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); -- cgit v1.1 From cf0923ea295ba08ae656ef04164a43cb6553ba99 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:20 +0100 Subject: xen: efficiently support a holey p2m table When using sparsemem and memory hotplug, the kernel's pseudo-physical address space can be discontigious. Previously this was dealt with by having the upper parts of the radix tree stubbed off. Unfortunately, this is incompatible with save/restore, which requires a complete p2m table. The solution is to have a special distinguished all-invalid p2m leaf page, which we can point all the hole areas at. This allows the tools to see a complete p2m table, but it only costs a page for all memory holes. It also simplifies the code since it removes a few special cases. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/mmu.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 644232a..da7b45b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -57,8 +57,17 @@ #include "mmu.h" #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) -static unsigned long *p2m_top[MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE]; +/* Placeholder for holes in the address space */ +static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] + __attribute__((section(".data.page_aligned"))) = + { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; + + /* Array of pointers to pages containing p2m entries */ +static unsigned long *p2m_top[TOP_ENTRIES] + __attribute__((section(".data.page_aligned"))) = + { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; static inline unsigned p2m_top_index(unsigned long pfn) { @@ -92,9 +101,6 @@ unsigned long get_phys_to_machine(unsigned long pfn) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == NULL) - return INVALID_P2M_ENTRY; - idx = p2m_index(pfn); return p2m_top[topidx][idx]; } @@ -110,7 +116,7 @@ static void alloc_p2m(unsigned long **pp) for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; - if (cmpxchg(pp, NULL, p) != NULL) + if (cmpxchg(pp, p2m_missing, p) != p2m_missing) free_page((unsigned long)p); } @@ -129,7 +135,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == NULL) { + if (p2m_top[topidx] == p2m_missing) { /* no need to allocate a page to store an invalid entry */ if (mfn == INVALID_P2M_ENTRY) return; -- cgit v1.1 From a0d695c821544947342a2d372ec4108bc813b979 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:21 +0100 Subject: xen: make dummy_shared_info non-static Rename dummy_shared_info to xen_dummy_shared_info and make it non-static, in anticipation of users outside of enlighten.c Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 6 +++--- arch/x86/xen/xen-ops.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73d3c84..d9faaa2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,13 +75,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); -static /* __initdata */ struct shared_info dummy_shared_info; +struct shared_info xen_dummy_shared_info; /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all @@ -104,7 +104,7 @@ static void __init xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; - BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; if (!have_vcpu_info_placement) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7bdc8c5..826d6e4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -15,6 +15,7 @@ DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; +extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; char * __init xen_memory_setup(void); -- cgit v1.1 From d5edbc1f75420935b1ec7e65df10c8f81cea82de Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:22 +0100 Subject: xen: add p2m mfn_list_list When saving a domain, the Xen tools need to remap all our mfns to portable pfns. In order to remap our p2m table, it needs to know where all its pages are, so maintain the references to the p2m table for it to use. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 39 ++++++++++++++++++++++++++++++++++++--- arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d9faaa2..ce67dc8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -880,6 +880,8 @@ static __init void setup_shared_info(void) /* In UP this is as good a place as any to set up shared info */ xen_setup_vcpu_info_placement(); #endif + + xen_setup_mfn_list_list(); } static __init void xen_pagetable_setup_done(pgd_t *base) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index da7b45b..4740cda 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -69,6 +69,13 @@ static unsigned long *p2m_top[TOP_ENTRIES] __attribute__((section(".data.page_aligned"))) = { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; +/* Arrays of p2m arrays expressed in mfns used for save/restore */ +static unsigned long p2m_top_mfn[TOP_ENTRIES] + __attribute__((section(".bss.page_aligned"))); + +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] + __attribute__((section(".bss.page_aligned"))); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_DOMAIN_PAGES); @@ -80,11 +87,35 @@ static inline unsigned p2m_index(unsigned long pfn) return pfn % P2M_ENTRIES_PER_PAGE; } +/* Build the parallel p2m_top_mfn structures */ +void xen_setup_mfn_list_list(void) +{ + unsigned pfn, idx; + + for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + + p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + } + + for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; + p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + } + + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn_list); + HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned pfn; unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned pfn; for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); @@ -105,7 +136,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) return p2m_top[topidx][idx]; } -static void alloc_p2m(unsigned long **pp) +static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) { unsigned long *p; unsigned i; @@ -118,6 +149,8 @@ static void alloc_p2m(unsigned long **pp) if (cmpxchg(pp, p2m_missing, p) != p2m_missing) free_page((unsigned long)p); + else + *mfnp = virt_to_mfn(p); } void set_phys_to_machine(unsigned long pfn, unsigned long mfn) @@ -139,7 +172,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) /* no need to allocate a page to store an invalid entry */ if (mfn == INVALID_P2M_ENTRY) return; - alloc_p2m(&p2m_top[topidx]); + alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]); } idx = p2m_index(pfn); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 826d6e4..a1bc89a 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -18,6 +18,8 @@ extern struct start_info *xen_start_info; extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; +void xen_setup_mfn_list_list(void); + char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.1 From 0e91398f2a5d4eb6b07df8115917d0d1cf3e9b58 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:27 +0100 Subject: xen: implement save/restore This patch implements Xen save/restore and migration. Saving is triggered via xenbus, which is polled in drivers/xen/manage.c. When a suspend request comes in, the kernel prepares itself for saving by: 1 - Freeze all processes. This is primarily to prevent any partially-completed pagetable updates from confusing the suspend process. If CONFIG_PREEMPT isn't defined, then this isn't necessary. 2 - Suspend xenbus and other devices 3 - Stop_machine, to make sure all the other vcpus are quiescent. The Xen tools require the domain to run its save off vcpu0. 4 - Within the stop_machine state, it pins any unpinned pgds (under construction or destruction), performs canonicalizes various other pieces of state (mostly converting mfns to pfns), and finally 5 - Suspend the domain Restore reverses the steps used to save the domain, ending when all the frozen processes are thawed. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 6 +++--- arch/x86/xen/mmu.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/smp.c | 2 +- arch/x86/xen/suspend.c | 42 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/time.c | 8 ++++++++ arch/x86/xen/xen-ops.h | 4 ++++ 7 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 arch/x86/xen/suspend.c (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b119b..2ba2d16 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o + time.o xen-asm.o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ce67dc8..b94f63a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -857,7 +857,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) PFN_DOWN(__pa(xen_start_info->pt_base))); } -static __init void setup_shared_info(void) +void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -894,7 +894,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; - setup_shared_info(); + xen_setup_shared_info(); /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ @@ -902,7 +902,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) } /* This is called once we have the cpu_possible_map */ -void __init xen_setup_vcpu_info_placement(void) +void xen_setup_vcpu_info_placement(void) { int cpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4740cda..e959559 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -560,6 +560,29 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + */ +void xen_mm_pin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + xen_pgd_pin((pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. */ @@ -617,6 +640,29 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. + */ +void xen_mm_unpin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + printk("unpinning pinned %p\n", page_address(page)); + xen_pgd_unpin((pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 74ab896..d2e3c20 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -35,7 +35,7 @@ #include "xen-ops.h" #include "mmu.h" -static cpumask_t xen_cpu_initialized_map; +cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq) = -1; static DEFINE_PER_CPU(int, callfunc_irq) = -1; static DEFINE_PER_CPU(int, debug_irq) = -1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c new file mode 100644 index 0000000..7620a16 --- /dev/null +++ b/arch/x86/xen/suspend.c @@ -0,0 +1,42 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" +#include "mmu.h" + +void xen_pre_suspend(void) +{ + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +void xen_post_suspend(int suspend_cancelled) +{ + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + xen_cpu_initialized_map = cpu_online_map; +#endif + } + + xen_setup_shared_info(); +} + diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5..0bef256 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,6 +572,14 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } +void xen_time_suspend(void) +{ +} + +void xen_time_resume(void) +{ +} + __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a1bc89a..a0503ac 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -9,6 +9,7 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +struct trap_info; void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(unsigned long, xen_cr3); @@ -19,6 +20,7 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); +void xen_setup_shared_info(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); @@ -59,6 +61,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait); +extern cpumask_t xen_cpu_initialized_map; + /* Declare an asm function, along with symbols needed to make it inlineable */ -- cgit v1.1 From 359cdd3f866b6219a6729e313faf2221397f3278 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:28 +0100 Subject: xen: maintain clock offset over save/restore Hook into the device model to make sure that timekeeping's resume handler is called. This deals with our clocksource's non-monotonicity over the save/restore. Explicitly call clock_has_changed() to make sure that all the timers get retriggered properly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/time.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0bef256..c39e1a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,14 +572,6 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } -void xen_time_suspend(void) -{ -} - -void xen_time_resume(void) -{ -} - __init void xen_time_init(void) { int cpu = smp_processor_id(); -- cgit v1.1 From b20aeccd6ad42ccb6be1b3d1d32618ddd2b31bf0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2008 14:24:38 +0200 Subject: xen: fix early bootup crash on native hardware -tip tree auto-testing found the following early bootup hang: --------------> get_memcfg_from_srat: assigning address to rsdp RSD PTR v0 [Nvidia] BUG: Int 14: CR2 ffd00040 EDI 8092fbfe ESI ffd00040 EBP 80b0aee8 ESP 80b0aed0 EBX 000f76f0 EDX 0000000e ECX 00000003 EAX ffd00040 err 00000000 EIP 802c055a CS 00000060 flg 00010006 Stack: ffd00040 80bc78d0 80b0af6c 80b1dbfe 8093d8ba 00000008 80b42810 80b4ddb4 80b42842 00000000 80b0af1c 801079c8 808e724e 00000000 80b42871 802c0531 00000100 00000000 0003fff0 80b0af40 80129999 00040100 00040100 00000000 Pid: 0, comm: swapper Not tainted 2.6.26-rc4-sched-devel.git #570 [<802c055a>] ? strncmp+0x11/0x25 [<80b1dbfe>] ? get_memcfg_from_srat+0xb4/0x568 [<801079c8>] ? mcount_call+0x5/0x9 [<802c0531>] ? strcmp+0xa/0x22 [<80129999>] ? printk+0x38/0x3a [<80129999>] ? printk+0x38/0x3a [<8011b122>] ? memory_present+0x66/0x6f [<80b216b4>] ? setup_memory+0x13/0x40c [<80b16b47>] ? propagate_e820_map+0x80/0x97 [<80b1622a>] ? setup_arch+0x248/0x477 [<80129999>] ? printk+0x38/0x3a [<80b11759>] ? start_kernel+0x6e/0x2eb [<80b110fc>] ? i386_start_kernel+0xeb/0xf2 ======================= <------ with this config: http://redhat.com/~mingo/misc/config-Wed_May_28_01_33_33_CEST_2008.bad The thing is, the crash makes little sense at first sight. We crash on a benign-looking printk. The code around it got changed in -tip but checking those topic branches individually did not reproduce the bug. Bisection led to this commit: | d5edbc1f75420935b1ec7e65df10c8f81cea82de is first bad commit | commit d5edbc1f75420935b1ec7e65df10c8f81cea82de | Author: Jeremy Fitzhardinge | Date: Mon May 26 23:31:22 2008 +0100 | | xen: add p2m mfn_list_list Which is somewhat surprising, as on native hardware Xen client side should have little to no side-effects. After some head scratching, it turns out the following happened: randconfig enabled the following Xen options: CONFIG_XEN=y CONFIG_XEN_MAX_DOMAIN_MEMORY=8 # CONFIG_XEN_BLKDEV_FRONTEND is not set # CONFIG_XEN_NETDEV_FRONTEND is not set CONFIG_HVC_XEN=y # CONFIG_XEN_BALLOON is not set which activated this piece of code in arch/x86/xen/mmu.c: > @@ -69,6 +69,13 @@ > __attribute__((section(".data.page_aligned"))) = > { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; > > +/* Arrays of p2m arrays expressed in mfns used for save/restore */ > +static unsigned long p2m_top_mfn[TOP_ENTRIES] > + __attribute__((section(".bss.page_aligned"))); > + > +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] > + __attribute__((section(".bss.page_aligned"))); The problem is, you must only put variables into .bss.page_aligned that have a _size_ that is _exactly_ page aligned. In this case the size of p2m_top_mfn_list is not page aligned: 80b8d000 b p2m_top_mfn 80b8f000 b p2m_top_mfn_list 80b8f008 b softirq_stack 80b97008 b hardirq_stack 80b9f008 b bm_pte So all subsequent variables get unaligned which, depending on luck, breaks the kernel in various funny ways. In this case what killed the kernel first was the misaligned bootmap pte page, resulting in that creative crash above. Anyway, this was a fun bug to track down :-) I think the moral is that .bss.page_aligned is a dangerous construct in its current form, and the symptoms of breakage are very non-trivial, so i think we need build-time checks to make sure all symbols in .bss.page_aligned are truly page aligned. The Xen fix below gets the kernel booting again. Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e959559..eef3b5c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -73,7 +73,8 @@ static unsigned long *p2m_top[TOP_ENTRIES] static unsigned long p2m_top_mfn[TOP_ENTRIES] __attribute__((section(".bss.page_aligned"))); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] +static unsigned long p2m_top_mfn_list[ + PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] __attribute__((section(".bss.page_aligned"))); static inline unsigned p2m_top_index(unsigned long pfn) -- cgit v1.1 From f0d43100f13be0fa5bf52741d7084bb27f00e621 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 29 May 2008 12:56:36 -0700 Subject: x86: extend e820 early_res support 32bit -fix #3 introduce init_pg_table_start, so xen PV could specify the value. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e4..ccc7b84 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1211,6 +1211,7 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; init_mm.pgd = pgd; /* use the Xen pagetables to start */ @@ -1246,5 +1247,5 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("hvc", 0, NULL); /* Start the world */ - start_kernel(); + i386_start_kernel(); } -- cgit v1.1 From 15ce60056b24a65b65e28de973a9fd8ac0750a2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 2 Jun 2008 13:20:11 +0200 Subject: xen: export get_phys_to_machine -tip testing found the following xen-console symbols trouble: ERROR: "get_phys_to_machine" [drivers/video/xen-fbfront.ko] undefined! ERROR: "get_phys_to_machine" [drivers/net/xen-netfront.ko] undefined! ERROR: "get_phys_to_machine" [drivers/input/xen-kbdfront.ko] undefined! with: http://redhat.com/~mingo/misc/config-Mon_Jun__2_12_25_13_CEST_2008.bad --- arch/x86/xen/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eef3b5c..17f374e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -136,6 +136,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) idx = p2m_index(pfn); return p2m_top[topidx][idx]; } +EXPORT_SYMBOL_GPL(get_phys_to_machine); static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) { -- cgit v1.1 From e2426cf85f8db5891fb5831323d2d0c176c4dadc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 31 May 2008 01:24:27 +0100 Subject: xen: avoid hypercalls when updating unpinned pud/pmd When operating on an unpinned pagetable (ie, one under construction or destruction), it isn't necessary to use a hypercall to update a pud/pmd entry. Jan Beulich observed that a similar optimisation avoided many thousands of hypercalls while doing a kernel build. One tricky part is that early in the kernel boot there's no page structure, so we can't check to see if the page is pinned. In that case, we just always use the hypercall. Signed-off-by: Jeremy Fitzhardinge Cc: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 14 +++++++++++--- arch/x86/xen/mmu.c | 39 +++++++++++++++++++++++++++++++++++---- arch/x86/xen/mmu.h | 8 ++++---- 3 files changed, 50 insertions(+), 11 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b94f63a..ed9f04b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -901,6 +901,14 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } +static __init void xen_post_allocator_init(void) +{ + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; + + xen_mark_init_mm_pinned(); +} + /* This is called once we have the cpu_possible_map */ void xen_setup_vcpu_info_placement(void) { @@ -988,7 +996,7 @@ static const struct pv_init_ops xen_init_ops __initdata = { .banner = xen_banner, .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, - .post_allocator_init = xen_mark_init_mm_pinned, + .post_allocator_init = xen_post_allocator_init, }; static const struct pv_time_ops xen_time_ops __initdata = { @@ -1100,7 +1108,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pte = NULL, /* see xen_pagetable_setup_* */ .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd, + .set_pmd = xen_set_pmd_hyper, .pte_val = xen_pte_val, .pte_flags = native_pte_val, @@ -1111,7 +1119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud, + .set_pud = xen_set_pud_hyper, .pte_clear = xen_pte_clear, .pmd_clear = xen_pmd_clear, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 17f374e..4fa0934 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -223,7 +223,14 @@ void make_lowmem_page_readwrite(void *vaddr) } -void xen_set_pmd(pmd_t *ptr, pmd_t val) +static bool page_pinned(void *ptr) +{ + struct page *page = virt_to_page(ptr); + + return PagePinned(page); +} + +void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) { struct multicall_space mcs; struct mmu_update *u; @@ -241,6 +248,18 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) preempt_enable(); } +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pmd_hyper(ptr, val); +} + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. @@ -348,7 +367,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) return ret; } -void xen_set_pud(pud_t *ptr, pud_t val) +void xen_set_pud_hyper(pud_t *ptr, pud_t val) { struct multicall_space mcs; struct mmu_update *u; @@ -366,6 +385,18 @@ void xen_set_pud(pud_t *ptr, pud_t val) preempt_enable(); } +void xen_set_pud(pud_t *ptr, pud_t val) +{ + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pud_hyper(ptr, val); +} + void xen_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; @@ -387,7 +418,7 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) void xen_pmd_clear(pmd_t *pmdp) { - xen_set_pmd(pmdp, __pmd(0)); + set_pmd(pmdp, __pmd(0)); } pmd_t xen_make_pmd(pmdval_t pmd) @@ -758,7 +789,7 @@ void xen_exit_mmap(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (PagePinned(virt_to_page(mm->pgd))) + if (page_pinned(mm->pgd)) xen_pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe961c..e3dd09e 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -25,10 +25,6 @@ enum pt_level { void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); @@ -45,10 +41,14 @@ pte_t xen_make_pte(pteval_t); pmd_t xen_make_pmd(pmdval_t); pgd_t xen_make_pgd(pgdval_t); +void xen_set_pte(pte_t *ptep, pte_t pteval); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud(pud_t *ptr, pud_t val); +void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); +void xen_set_pud_hyper(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); -- cgit v1.1 From 9c7a794209f8a91f47697c3be20597eb60531e6d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 31 May 2008 01:33:02 +0100 Subject: xen: restore vcpu_info mapping If we're using vcpu_info mapping, then make sure its restored on all processors before relasing them from stop_machine. The only complication is that if this fails, we can't continue because we've already made assumptions that the mapping is available (baked in calls to the _direct versions of the functions, for example). Fortunately this can only happen with a 32-bit hypervisor, which may possibly run out of mapping space. On a 64-bit hypervisor, this is a non-issue. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 30 +++++++++++++++++++++++++++++- arch/x86/xen/suspend.c | 4 +++- arch/x86/xen/xen-ops.h | 1 + 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ed9f04b..8e6152e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -98,7 +98,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; -static void __init xen_vcpu_setup(int cpu) +static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -136,6 +136,34 @@ static void __init xen_vcpu_setup(int cpu) } } +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + if (have_vcpu_info_placement) { + int cpu; + + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); + + xen_vcpu_setup(cpu); + + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); + } + + BUG_ON(!have_vcpu_info_placement); + } +} + static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7620a16..7ab10f6 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -26,6 +26,8 @@ void xen_pre_suspend(void) void xen_post_suspend(int suspend_cancelled) { + xen_setup_shared_info(); + if (suspend_cancelled) { xen_start_info->store_mfn = pfn_to_mfn(xen_start_info->store_mfn); @@ -35,8 +37,8 @@ void xen_post_suspend(int suspend_cancelled) #ifdef CONFIG_SMP xen_cpu_initialized_map = cpu_online_map; #endif + xen_vcpu_restore(); } - xen_setup_shared_info(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a0503ac..a457e03 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -26,6 +26,7 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.1 From d07af1f0e3a3e378074fc36322dd7b0e72d9a3e2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 31 May 2008 01:33:03 +0100 Subject: xen: resume timers on all vcpus On resume, the vcpu timer modes will not be restored. The timer infrastructure doesn't do this for us, since it assumes the cpus are offline. We can just poke the other vcpus into the right mode directly though. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/suspend.c | 1 + arch/x86/xen/time.c | 13 +++++++++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 15 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7ab10f6..251669a 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -38,6 +38,7 @@ void xen_post_suspend(int suspend_cancelled) xen_cpu_initialized_map = cpu_online_map; #endif xen_vcpu_restore(); + xen_timer_resume(); } } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5..ea137fb 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,6 +572,19 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } +void xen_timer_resume(void) +{ + int cpu; + + if (xen_clockevent != &xen_vcpuop_clockevent) + return; + + for_each_online_cpu(cpu) { + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + } +} + __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a457e03..9a05559 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -37,6 +37,7 @@ void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); +void xen_timer_resume(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.1 From 7e0edc1bc343231029084761ebf59e522902eb49 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 31 May 2008 01:33:04 +0100 Subject: xen: add new Xen elfnote types and use them appropriately Define recently added XEN_ELFNOTEs, and use them appropriately. Most significantly, this enables domain checkpointing (xm save -c). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 2ab5f42..ef6c9e0 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -7,6 +7,7 @@ #include #include #include +#include __INIT ENTRY(startup_xen) @@ -32,5 +33,9 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, + .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) #endif /*CONFIG_XEN */ -- cgit v1.1 From eb179e443deb0a5c81a62b4c157124a4b7ff1813 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:53 -0700 Subject: xen: mask unwanted pte bits in __supported_pte_mask [ Stable: this isn't a bugfix in itself, but it's a pre-requiste for "xen: don't drop NX bit" ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8e6152e..73fb0c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1280,6 +1280,11 @@ asmlinkage void __init xen_start_kernel(void) if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7c99358..f0078d9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -343,10 +343,8 @@ pgdval_t xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) { + if (pte & _PAGE_PRESENT) pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } return (pte_t){ .pte = pte }; } -- cgit v1.1 From a987b16cc6123af2c9414032701bab5f73c54c89 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:56 -0700 Subject: xen: don't drop NX bit Because NX is now enforced properly, we must put the hypercall page into the .text segment so that it is executable. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 54 +++++++++++++++++++++++++++---------------------- arch/x86/xen/xen-head.S | 2 +- 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f0078d9..8132aa8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -323,46 +323,54 @@ out: preempt_enable(); } -pteval_t xen_pte_val(pte_t pte) +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) { - pteval_t ret = pte.pte; + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + } - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return val; +} - return ret; +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + } + + return val; +} + +pteval_t xen_pte_val(pte_t pte) +{ + return pte_mfn_to_pfn(pte.pte); } pgdval_t xen_pgd_val(pgd_t pgd) { - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pgd.pgd); } pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ .pte = pte }; + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } pgd_t xen_make_pgd(pgdval_t pgd) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } pmdval_t xen_pmd_val(pmd_t pmd) { - pmdval_t ret = native_pmd_val(pmd); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } void xen_set_pud_hyper(pud_t *ptr, pud_t val) @@ -421,9 +429,7 @@ void xen_pmd_clear(pmd_t *pmdp) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & _PAGE_PRESENT) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - + pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index ef6c9e0..7c0cf63 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -18,7 +18,7 @@ ENTRY(startup_xen) __FINIT -.pushsection .bss.page_aligned +.pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) .skip 0x1000 -- cgit v1.1 From aeaaa59c7e15dcfaaf57ce069ef81683067d575d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jun 2008 11:42:01 -0700 Subject: x86/paravirt/xen: add set_fixmap pv_mmu_ops Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Juan Quintela Signed-off-by: Eduardo Habkost Signed-off-by: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e4..0ad8a64 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -960,6 +960,33 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } +static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif + case FIX_WP_TEST: + case FIX_VDSO: +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ +#endif + pte = pfn_pte(phys, prot); + break; + + default: + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); +} + static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, @@ -1112,6 +1139,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .enter = paravirt_enter_lazy_mmu, .leave = xen_leave_lazy, }, + + .set_fixmap = xen_set_fixmap, }; #ifdef CONFIG_SMP -- cgit v1.1 From 08b882c627aeeeb3cfd3c4354f0d360d7949549d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 04:30:01 -0700 Subject: paravirt: add hooks for ptep_modify_prot_start/commit This patch adds paravirt-ops hooks in pv_mmu_ops for ptep_modify_prot_start and ptep_modify_prot_commit. This allows the hypervisor-specific backends to implement these in some more efficient way. Signed-off-by: Jeremy Fitzhardinge Acked-by: Linus Torvalds Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73fb0c4..0b7553c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1138,6 +1138,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + .pte_val = xen_pte_val, .pte_flags = native_pte_val, .pgd_val = xen_pgd_val, -- cgit v1.1 From e57778a1e30470c9f5b79e370511b9af29b59c48 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 04:30:02 -0700 Subject: xen: implement ptep_modify_prot_start/commit Xen has a pte update function which will update a pte while preserving its accessed and dirty bits. This means that ptep_modify_prot_start() can be implemented as a simple read of the pte value. The hardware may update the pte in the meantime, but ptep_modify_prot_commit() updates it while preserving any changes that may have happened in the meantime. The updates in ptep_modify_prot_commit() are batched if we're currently in lazy mmu mode. The mmu_update hypercall can take a batch of updates to perform, but this code doesn't make particular use of that feature, in favour of using generic multicall batching to get them all into the hypervisor. The net effect of this is that each mprotect pte update turns from two expensive trap-and-emulate faults into they hypervisor into a single hypercall whose cost is amortized in a batched multicall. Signed-off-by: Jeremy Fitzhardinge Acked-by: Linus Torvalds Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 13 ++++++++++--- arch/x86/xen/mmu.c | 21 +++++++++++++++++++++ arch/x86/xen/mmu.h | 4 ++++ 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0b7553c..bd74229 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -168,7 +168,9 @@ static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); - printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); + printk(KERN_INFO "Hypervisor signature: %s%s\n", + xen_start_info->magic, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } static void xen_cpuid(unsigned int *ax, unsigned int *bx, @@ -1243,6 +1245,8 @@ asmlinkage void __init xen_start_kernel(void) BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); + xen_setup_features(); + /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; @@ -1252,14 +1256,17 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { + pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; + pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + } + machine_ops = xen_machine_ops; #ifdef CONFIG_SMP smp_ops = xen_smp_ops; #endif - xen_setup_features(); - /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) xen_build_dynamic_phys_to_machine(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 8132aa8..846dad7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -323,6 +323,27 @@ out: preempt_enable(); } +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + /* Just return the pte as-is. We preserve the bits on commit */ + return *ptep; +} + +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u->val = pte_val_ma(pte); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + /* Assume pteval_t is equivalent to all the other *val_t types. */ static pteval_t pte_mfn_to_pfn(pteval_t val) { diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index e3dd09e..297bf9f 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -52,4 +52,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + #endif /* _XEN_MMU_H */ -- cgit v1.1 From 400d34944c4ad82a817c06e570bc93b1114aa596 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 04:30:03 -0700 Subject: xen: add mechanism to extend existing multicalls Some Xen hypercalls accept an array of operations to work on. In general this is because its more efficient for the hypercall to the work all at once rather than as separate hypercalls (even batched as a multicall). This patch adds a mechanism (xen_mc_extend_args()) to allocate more argument space to the last-issued multicall, in order to extend its argument list. The user of this mechanism is xen/mmu.c, which uses it to extend the args array of mmu_update. This is particularly valuable when doing the update for a large mprotect, which goes via ptep_modify_prot_commit(), but it also manages to batch updates to pgd/pmds as well. Signed-off-by: Jeremy Fitzhardinge Acked-by: Linus Torvalds Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 55 ++++++++++++++++++++++++++++++----------------- arch/x86/xen/multicalls.c | 40 ++++++++++++++++++++++++++++------ arch/x86/xen/multicalls.h | 12 +++++++++++ 3 files changed, 81 insertions(+), 26 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 846dad7..f6b8225 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -230,18 +230,35 @@ static bool page_pinned(void *ptr) return PagePinned(page); } -void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +static void extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; - preempt_disable(); + mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); + + if (mcs.mc != NULL) + mcs.mc->args[1]++; + else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } - mcs = xen_mc_entry(sizeof(*u)); u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pmd_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + *u = *update; +} + +void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -332,14 +349,13 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - struct multicall_space mcs; - struct mmu_update *u; + struct mmu_update u; - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; - u->val = pte_val_ma(pte); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.val = pte_val_ma(pte); + extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -396,16 +412,15 @@ pmdval_t xen_pmd_val(pmd_t pmd) void xen_set_pud_hyper(pud_t *ptr, pud_t val) { - struct multicall_space mcs; - struct mmu_update *u; + struct mmu_update u; preempt_disable(); - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pud_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + xen_mc_batch(); + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 5791eb2..3c63c4da 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -29,14 +29,14 @@ #define MC_DEBUG 1 #define MC_BATCH 32 -#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) +#define MC_ARGS (MC_BATCH * 16) struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG struct multicall_entry debug[MC_BATCH]; #endif - u64 args[MC_ARGS]; + unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); void *data; @@ -107,20 +107,48 @@ struct multicall_space __xen_mc_entry(size_t args) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct multicall_space ret; - unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + unsigned argidx = roundup(b->argidx, sizeof(u64)); BUG_ON(preemptible()); - BUG_ON(argspace > MC_ARGS); + BUG_ON(b->argidx > MC_ARGS); if (b->mcidx == MC_BATCH || - (b->argidx + argspace) > MC_ARGS) + (argidx + args) > MC_ARGS) { xen_mc_flush(); + argidx = roundup(b->argidx, sizeof(u64)); + } ret.mc = &b->entries[b->mcidx]; b->mcidx++; + ret.args = &b->args[argidx]; + b->argidx = argidx + args; + + BUG_ON(b->argidx > MC_ARGS); + return ret; +} + +struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_space ret = { NULL, NULL }; + + BUG_ON(preemptible()); + BUG_ON(b->argidx > MC_ARGS); + + if (b->mcidx == 0) + return ret; + + if (b->entries[b->mcidx - 1].op != op) + return ret; + + if ((b->argidx + size) > MC_ARGS) + return ret; + + ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; - b->argidx += argspace; + b->argidx += size; + BUG_ON(b->argidx > MC_ARGS); return ret; } diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 8bae996..8589382 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode) /* Set up a callback to be called when the current batch is flushed */ void xen_mc_callback(void (*fn)(void *), void *data); +/* + * Try to extend the arguments of the previous multicall command. The + * previous command's op must match. If it does, then it attempts to + * extend the argument space allocated to the multicall entry by + * arg_size bytes. + * + * The returned multicall_space will return with mc pointing to the + * command on success, or NULL on failure, and args pointing to the + * newly allocated space. + */ +struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); + #endif /* _XEN_MULTICALLS_H */ -- cgit v1.1 From 3b16cf874861436725c43ba0b68bdd799297be7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 11:21:54 +0200 Subject: x86: convert to generic helpers for IPI function calls This converts x86, x86-64, and xen to use the new helpers for smp_call_function() and friends, and adds support for smp_call_function_single(). Acked-by: Ingo Molnar Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- arch/x86/xen/enlighten.c | 4 +- arch/x86/xen/mmu.c | 2 +- arch/x86/xen/smp.c | 133 ++++++++++++++++------------------------------- arch/x86/xen/xen-ops.h | 9 +--- 4 files changed, 52 insertions(+), 96 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f09c1c6..8e31778 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1108,7 +1108,9 @@ static const struct smp_ops xen_smp_ops __initdata = { .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, - .smp_call_function_mask = xen_smp_call_function_mask, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, }; #endif /* CONFIG_SMP */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index df40bf7..5c01590 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -558,7 +558,7 @@ static void drop_mm_ref(struct mm_struct *mm) } if (!cpus_empty(mask)) - xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 94e6900..b3786e7 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -36,27 +36,14 @@ #include "mmu.h" static cpumask_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq) = -1; -static DEFINE_PER_CPU(int, callfunc_irq) = -1; -static DEFINE_PER_CPU(int, debug_irq) = -1; - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static DEFINE_PER_CPU(int, callfuncsingle_irq); +static DEFINE_PER_CPU(int, debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); - -static struct call_data_struct *call_data; +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* * Reschedule call back. Nothing to do, @@ -122,6 +109,17 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(debug_irq, cpu) = rc; + callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, + cpu, + xen_call_function_single_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfuncsingle_irq, cpu) = rc; + return 0; fail: @@ -131,6 +129,9 @@ static int xen_smp_intr_init(unsigned int cpu) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); if (per_cpu(debug_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + if (per_cpu(callfuncsingle_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + return rc; } @@ -338,7 +339,6 @@ void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } - static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) { unsigned cpu; @@ -349,83 +349,42 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) xen_send_IPI_one(cpu, vector); } +void xen_smp_send_call_function_ipi(cpumask_t mask) +{ + int cpu; + + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run if they need to. */ + for_each_cpu_mask(cpu, mask) { + if (xen_vcpu_stolen(cpu)) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + break; + } + } +} + +void xen_smp_send_call_function_single_ipi(int cpu) +{ + xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); +} + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func)(info); + generic_smp_call_function_interrupt(); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); - if (wait) { - mb(); /* commit everything before setting finished */ - atomic_inc(&call_data->finished); - } - return IRQ_HANDLED; } -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait) +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { - struct call_data_struct data; - int cpus, cpu; - bool yield; - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - - cpu_clear(smp_processor_id(), mask); - - cpus = cpus_weight(mask); - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); /* write everything before IPI */ - - /* Send a message to other CPUs and wait for them to respond */ - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - - /* Make sure other vcpus get a chance to run if they need to. */ - yield = false; - for_each_cpu_mask(cpu, mask) - if (xen_vcpu_stolen(cpu)) - yield = true; - - if (yield) - HYPERVISOR_sched_op(SCHEDOP_yield, 0); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus || - (wait && atomic_read(&data.finished) != cpus)) - cpu_relax(); - - spin_unlock(&call_lock); + irq_enter(); + generic_smp_call_function_single_interrupt(); + __get_cpu_var(irq_stat).irq_call_count++; + irq_exit(); - return 0; + return IRQ_HANDLED; } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f1063ae..a636ab5 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -46,13 +46,8 @@ void xen_smp_cpus_done(unsigned int max_cpus); void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); -int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait); -int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait); - -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait); +void xen_smp_send_call_function_ipi(cpumask_t mask); +void xen_smp_send_call_function_single_ipi(int cpu); /* Declare an asm function, along with symbols needed to make it -- cgit v1.1 From 8691e5a8f691cc2a4fda0651e8d307aaba0e7d68 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2008 11:18:06 +0200 Subject: smp_call_function: get rid of the unused nonatomic/retry argument It's never used and the comments refer to nonatomic and retry interchangably. So get rid of it. Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- arch/x86/xen/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b3786e7..a1651d0 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -331,7 +331,7 @@ static void stop_self(void *v) void xen_smp_send_stop(void) { - smp_call_function(stop_self, NULL, 0, 0); + smp_call_function(stop_self, NULL, 0); } void xen_smp_send_reschedule(int cpu) -- cgit v1.1 From d8355aca23863be659ec5b7e0393cfbfa91ec221 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 3 Jul 2008 22:10:18 -0700 Subject: xen: fix address truncation in pte mfn<->pfn conversion When converting the page number in a pte/pmd/pud/pgd between machine and pseudo-physical addresses, the converted result was being truncated at 32-bits. This caused failures on machines with more than 4G of physical memory. Signed-off-by: Jeremy Fitzhardinge Cc: "Christopher S. Aker" Cc: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index df40bf7..4e527e7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -185,7 +185,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; pteval_t flags = val & ~PTE_MASK; - val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } return val; @@ -196,7 +196,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; pteval_t flags = val & ~PTE_MASK; - val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } return val; -- cgit v1.1 From d0be6bdea103b8d04c8a3495538b7c0011ae4129 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 15 Jun 2008 18:58:51 -0700 Subject: x86: rename two e820 related functions rename update_memory_range to e820_update_range rename add_memory_region to e820_add_region to make it more clear that they are about e820 map operations. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 82517e4..9001c9d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -39,8 +39,8 @@ char * __init xen_memory_setup(void) unsigned long max_pfn = xen_start_info->nr_pages; e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); return "Xen"; } -- cgit v1.1 From be5bf9fa1c327fa6fd6e7ba44665437dd558dfe3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 14:54:46 -0700 Subject: xen: reserve Xen-specific memory in e820 map Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a295758..dc2ca8a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -40,9 +40,22 @@ char * __init xen_memory_setup(void) max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); e820.nr_map = 0; + e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + /* + * Reserve Xen bits: + * - mfn_list + * - xen_start_info + * See comment above "struct start_info" in + */ + e820_add_region(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list, + E820_RESERVED); + + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + return "Xen"; } -- cgit v1.1 From b792c755907cffceab393585b626ef2553c38538 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 14:54:49 -0700 Subject: xen: reserve ISA space in e820 map [ TODO: release the underlying memory back to Xen. ] Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar Cc: Yinghai Lu Cc: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dc2ca8a..e0a3959 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -41,8 +42,15 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); + + /* + * Even though this is normal, usable memory under Xen, reserve + * ISA memory anyway because too many things think they can poke + * about in there. + */ + e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, + E820_RESERVED); /* * Reserve Xen bits: -- cgit v1.1 From 88a6846c70ad6bf33a545d554ace801d69e8a1a5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 14:54:51 -0700 Subject: xen: set max_pfn_mapped Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 316623c..76ad1ef 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1304,6 +1304,7 @@ asmlinkage void __init xen_start_kernel(void) init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; init_mm.pgd = pgd; /* use the Xen pagetables to start */ -- cgit v1.1 From eba0045ff87bab465d3c80c289f3bf709c1800f5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:12 -0400 Subject: x86/paravirt: add a pgd_alloc/free hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add hooks which are called at pgd_alloc/free time. The pgd_alloc hook may return an error code, which if non-zero, causes the pgd allocation to be failed. The hooks may be used to allocate/free auxillary per-pgd information. also fix: > * Ingo Molnar wrote: > > include/asm/pgalloc.h: In function ‘paravirt_pgd_free': > include/asm/pgalloc.h:14: error: parameter name omitted > arch/x86/kernel/entry_64.S: In file included from > arch/x86/kernel/traps_64.c:51:include/asm/pgalloc.h: In function ‘paravirt_pgd_free': > include/asm/pgalloc.h:14: error: parameter name omitted Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 76ad1ef..d62f14e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -1153,6 +1154,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pgd_alloc = __paravirt_pgd_alloc, + .pgd_free = paravirt_nop, + .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, .alloc_pmd = xen_alloc_pte_init, -- cgit v1.1 From d75cd22fdd5f7d203fb60014d426942df33dd9a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:26 -0400 Subject: x86/paravirt: split sysret and sysexit Don't conflate sysret and sysexit; they're different instructions with different semantics, and may be in use at the same time (at least within the same kernel, depending on whether its an Intel or AMD system). sysexit - just return to userspace, does no register restoration of any kind; must explicitly atomically enable interrupts. sysret - reloads flags from r11, so no need to explicitly enable interrupts on 64-bit, responsible for restoring usermode %gs Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d62f14e..119c88f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1089,7 +1089,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = xen_iret, - .irq_enable_syscall_ret = xen_sysexit, + .irq_enable_sysexit = xen_sysexit, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, -- cgit v1.1 From fab58420ac0007a452b540cfb07923225ea4f48d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 25 Jun 2008 00:19:31 -0400 Subject: x86/paravirt, 64-bit: add adjust_exception_frame 64-bit Xen pushes a couple of extra words onto an exception frame. Add a hook to deal with them. Signed-off-by: Jeremy Fitzhardinge Cc: xen-devel Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 119c88f..3b98083 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1123,6 +1123,9 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { .irq_enable = xen_irq_enable, .safe_halt = xen_safe_halt, .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = paravirt_nop, +#endif }; static const struct pv_apic_ops xen_apic_ops __initdata = { -- cgit v1.1 From e93ef949fd9a3f237aedfb8e64414b28980530b8 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 1 Jul 2008 11:43:36 -0700 Subject: x86: rename paravirtualized TSC functions Rename the paravirtualized calculate_cpu_khz to calibrate_tsc. In all cases, we actually calibrate_tsc and use that as the cpu_khz value. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Cc: Dan Hecht Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/time.c | 4 ++-- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b98083..dcd4e51 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1062,7 +1062,7 @@ static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_cpu_khz = xen_cpu_khz, + .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 64f0038..685b774 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void) } -/* Get the CPU speed from Xen */ -unsigned long xen_cpu_khz(void) +/* Get the TSC speed from Xen */ +unsigned long xen_tsc_khz(void) { u64 xen_khz = 1000000ULL << 32; const struct pvclock_vcpu_time_info *info = diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a05559..d852ddb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -32,7 +32,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); -unsigned long xen_cpu_khz(void); +unsigned long xen_tsc_khz(void); void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); -- cgit v1.1 From ad66dd340f561bdde2285992314d9e4fd9b6191e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 11 Jul 2008 13:11:56 -0700 Subject: x2apic: xen64 paravirt basic apic ops Define the Xen specific basic apic ops, in additon to paravirt apic ops, with some misc warning fixes. Signed-off-by: Suresh Siddha Cc: Jeremy Fitzhardinge Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dcd4e51..54e2556 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -548,16 +548,45 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static u32 xen_apic_read(unsigned long reg) +static u32 xen_apic_read(u32 reg) { return 0; } -static void xen_apic_write(unsigned long reg, u32 val) +static void xen_apic_write(u32 reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); } + +#ifdef CONFIG_X86_64 +static u64 xen_apic_icr_read(void) +{ + return 0; +} + +static void xen_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void xen_apic_wait_icr_idle(void) +{ + return; +} + +static struct apic_ops xen_basic_apic_ops = { + .read = xen_apic_read, + .write = xen_apic_write, + .write_atomic = xen_apic_write, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, + .wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_apic_wait_icr_idle, +}; +#endif + #endif static void xen_flush_tlb(void) @@ -1130,9 +1159,11 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_X86_64 .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, +#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1291,6 +1322,12 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; +#ifdef CONFIG_X86_64 + /* + * for 64bit, set up the basic apic ops aswell. + */ + apic_ops = &xen_basic_apic_ops; +#endif if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; -- cgit v1.1 From 94a8c3c2437c8946f1b6c8e0b2c560a7db8ed3c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Jul 2008 22:19:35 -0700 Subject: x86: let 32bit use apic_ops too - fix fix for pv - clean up the namespace there too. Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 54e2556..d11dda7e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -559,7 +559,6 @@ static void xen_apic_write(u32 reg, u32 val) WARN_ON(1); } -#ifdef CONFIG_X86_64 static u64 xen_apic_icr_read(void) { return 0; @@ -576,6 +575,11 @@ static void xen_apic_wait_icr_idle(void) return; } +static u32 xen_safe_apic_wait_icr_idle(void) +{ + return 0; +} + static struct apic_ops xen_basic_apic_ops = { .read = xen_apic_read, .write = xen_apic_write, @@ -583,9 +587,8 @@ static struct apic_ops xen_basic_apic_ops = { .icr_read = xen_apic_icr_read, .icr_write = xen_apic_icr_write, .wait_icr_idle = xen_apic_wait_icr_idle, - .safe_wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, }; -#endif #endif @@ -1159,11 +1162,6 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC -#ifndef CONFIG_X86_64 - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, -#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1322,9 +1320,10 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; -#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_LOCAL_APIC /* - * for 64bit, set up the basic apic ops aswell. + * set up the basic apic ops. */ apic_ops = &xen_basic_apic_ops; #endif -- cgit v1.1 From a312b37b2a212fd2e227d1d6321f903b91b65ec7 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:23 -0700 Subject: x86/paravirt: call paravirt_pagetable_setup_{start, done} Call paravirt_pagetable_setup_{start,done} These paravirt_ops functions were not being called on x86_64. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb50845..eaab6c9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,6 +841,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { +#ifdef CONFIG_X86_32 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; int i; @@ -886,6 +887,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) /* Unpin initial Xen pagetable */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(xen_start_info->pt_base))); +#endif /* CONFIG_X86_32 */ } void xen_setup_shared_info(void) @@ -927,9 +929,11 @@ static __init void xen_pagetable_setup_done(pgd_t *base) xen_setup_shared_info(); +#ifdef CONFIG_X86_32 /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); +#endif } static __init void xen_post_allocator_init(void) -- cgit v1.1 From c1f2f09ef66d5dadd5fe42ea909e708470c9636d Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:24 -0700 Subject: pvops-64: call paravirt_post_allocator_init() on setup_arch() Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ff0aa74..ebd6900 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -656,9 +656,11 @@ void xen_mm_pin_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } -/* The init_mm pagetable is really pinned as soon as its created, but - that's before we have page structures to store the bits. So do all - the book-keeping now. */ +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ static __init int mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); -- cgit v1.1 From cbcd79c2e5b496b84845618cef734b4c40736576 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:27 -0700 Subject: x86: use __page_aligned_data/bss Update arch/x86's use of page-aligned variables. The change to arch/x86/xen/mmu.c fixes an actual bug, but the rest are cleanups and to set a precedent. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ebd6900..4fca9d8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -60,22 +61,18 @@ #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] - __attribute__((section(".data.page_aligned"))) = +static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] - __attribute__((section(".data.page_aligned"))) = +static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; -static unsigned long p2m_top_mfn_list[ - PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] + __page_aligned_bss; static inline unsigned p2m_top_index(unsigned long pfn) { -- cgit v1.1 From 8ba6c2b0958c332d2f3336f4ca9c116ed81f38e9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:30 -0700 Subject: xen: print backtrace on multicall failure Print a backtrace if a multicall fails, to help with debugging. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/multicalls.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 3c63c4da..9efd1c6 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -76,6 +76,7 @@ void xen_mc_flush(void) if (ret) { printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", ret, smp_processor_id()); + dump_stack(); for (i = 0; i < b->mcidx; i++) { printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, -- cgit v1.1 From ad55db9fed6d6cd09333045945cb03ba2c070085 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 8 Jul 2008 15:06:32 -0700 Subject: xen: add xen_arch_resume()/xen_timer_resume hook for ia64 support add xen_timer_resume() hook. Timer resume should be done after event channel is resumed. add xen_arch_resume() hook when ipi becomes usable after resume. After resume, some cpu specific resource must be reinitialized on ia64 that can't be set by another cpu. However available hooks is run once on only one cpu so that ipi has to be used. During stop_machine_run() ipi can't be used because interrupt is masked. So add another hook after stop_machine_run(). Another approach might be use resume hook which is run by device_resume(). However device_resume() may be executed on suspend error recovery path. So it is necessary to determine whether it is executed on real resume path or error recovery path. Signed-off-by: Isaku Yamahata Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/suspend.c | 5 ++++- arch/x86/xen/xen-ops.h | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 251669a..2a234db 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled) xen_cpu_initialized_map = cpu_online_map; #endif xen_vcpu_restore(); - xen_timer_resume(); } } +void xen_arch_resume(void) +{ + /* nothing */ +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 6f4b104..77354d2 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -37,7 +37,6 @@ void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_timer_resume(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.1 From 851fa3c4e7b50d6a946d8b4c0a68683b5e56b2f1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:33 -0700 Subject: xen: define set_pte from the outset We need set_pte to work from a relatively early point, so enable it from the start. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index eaab6c9..c5f0b40 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -845,9 +845,6 @@ static __init void xen_pagetable_setup_start(pgd_t *base) pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; int i; - /* special set_pte for pagetable initialization */ - pv_mmu_ops.set_pte = xen_set_pte_init; - init_mm.pgd = base; /* * copy top-level of Xen-supplied pagetable into place. This @@ -1174,7 +1171,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif - .set_pte = NULL, /* see xen_pagetable_setup_* */ + .set_pte = xen_set_pte_init, .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, -- cgit v1.1 From 48b5db20621388582ca11ac3c61d3403966dbe51 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:34 -0700 Subject: xen64: define asm/xen/interface for 64-bit Copy 64-bit definitions of various interface structures into place. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 297bf9f..7856e37 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -10,18 +10,6 @@ enum pt_level { PT_PTE }; -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) - void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -- cgit v1.1 From 7077c33d81a8d790135ae87cd19e6efcb075c23a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:35 -0700 Subject: xen: make ELF notes work for 32 and 64 bit Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7c0cf63..a9cac9d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -5,7 +5,10 @@ #include #include + #include +#include + #include #include @@ -21,21 +24,21 @@ ENTRY(startup_xen) .pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) - .skip 0x1000 + .skip PAGE_SIZE_asm .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) #endif /*CONFIG_XEN */ -- cgit v1.1 From f6e587325b3bc7e5c829a407ddc25b52c1e73851 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:38 -0700 Subject: xen64: add extra pv_mmu_ops We need extra pv_mmu_ops for 64-bit, to deal with the extra level of pagetable. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 33 ++++++++++++++++++++++++++++++- arch/x86/xen/mmu.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/mmu.h | 15 ++++++++++++-- 3 files changed, 95 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c5f0b40..afb047e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -803,6 +803,18 @@ static void xen_release_pmd(u32 pfn) xen_release_ptpage(pfn, PT_PMD); } +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + #ifdef CONFIG_HIGHPTE static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -922,6 +934,11 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + pv_mmu_ops.set_pte = xen_set_pte; xen_setup_shared_info(); @@ -937,6 +954,9 @@ static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif xen_mark_init_mm_pinned(); } @@ -1185,15 +1205,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, +#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud_hyper, .pte_clear = xen_pte_clear, .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4fca9d8..d0976b8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -438,14 +438,19 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { +#ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; +#else + *ptep = pte; +#endif } +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((u64 *)ptep, pte_val_ma(pte)); + set_64bit((u64 *)ptep, native_pte_val(pte)); } void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -459,6 +464,7 @@ void xen_pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } +#endif /* CONFIG_X86_PAE */ pmd_t xen_make_pmd(pmdval_t pmd) { @@ -466,6 +472,49 @@ pmd_t xen_make_pmd(pmdval_t pmd) return native_make_pmd(pmd); } +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud) +{ + return pte_mfn_to_pfn(pud.pud); +} + +pud_t xen_make_pud(pudval_t pud) +{ + pud = pte_pfn_to_mfn(pud); + + return native_make_pud(pud); +} + +void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pgd_val_ma(val); + extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pgd(pgd_t *ptr, pgd_t val) +{ + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pgd_hyper(ptr, val); +} +#endif /* PAGETABLE_LEVELS == 4 */ + /* (Yet another) pagetable walker. This one is intended for pinning a pagetable. This means that it walks a pagetable and calls the diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 7856e37..19d544b 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -32,13 +32,24 @@ pgd_t xen_make_pgd(pgdval_t); void xen_set_pte(pte_t *ptep, pte_t pteval); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); + +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); +#endif /* CONFIG_X86_PAE */ + void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud(pud_t *ptr, pud_t val); void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud_hyper(pud_t *ptr, pud_t val); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); + +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud); +pud_t xen_make_pud(pudval_t pudval); +void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); +void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); +#endif pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, -- cgit v1.1 From f5d36de069f4b343f64e858e7377cfc9c772c4fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:39 -0700 Subject: xen64: random ifdefs to mask out 32-bit only code Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index afb047e..ada2e1a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1296,6 +1296,7 @@ static const struct machine_ops __initdata xen_machine_ops = { static void __init xen_reserve_top(void) { +#ifdef CONFIG_X86_32 unsigned long top = HYPERVISOR_VIRT_START; struct xen_platform_parameters pp; @@ -1303,6 +1304,7 @@ static void __init xen_reserve_top(void) top = pp.virt_start; reserve_top_address(-top + 2 * PAGE_SIZE); +#endif /* CONFIG_X86_32 */ } /* First C function to be called on Xen boot */ @@ -1333,6 +1335,11 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; +#ifdef CONFIG_X86_64 + /* Disable until direct per-cpu data access. */ + have_vcpu_info_placement = 0; +#endif + #ifdef CONFIG_SMP smp_ops = xen_smp_ops; #endif @@ -1343,9 +1350,11 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; +#ifdef CONFIG_X86_32 init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; +#endif init_mm.pgd = pgd; /* use the Xen pagetables to start */ @@ -1372,7 +1381,9 @@ asmlinkage void __init xen_start_kernel(void) /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); +#ifdef CONFIG_X86_32 new_cpu_data.hard_math = 1; +#endif new_cpu_data.x86_capability[0] = cpuid_edx(1); /* Poke various useful things into boot_params */ @@ -1388,5 +1399,9 @@ asmlinkage void __init xen_start_kernel(void) } /* Start the world */ +#ifdef CONFIG_X86_32 i386_start_kernel(); +#else + x86_64_start_kernel((char *)&boot_params); +#endif } -- cgit v1.1 From ce87b3d326de733c72b47662f106ee6cd699a20f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:40 -0700 Subject: xen64: get active_mm from the pda x86_64 stores the active_mm in the pda, so fetch it from there. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d0976b8..2579e70 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -805,8 +805,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static void drop_other_mm_ref(void *info) { struct mm_struct *mm = info; + struct mm_struct *active_mm; - if (__get_cpu_var(cpu_tlbstate).active_mm == mm) +#ifdef CONFIG_X86_64 + active_mm = read_pda(active_mm); +#else + active_mm = __get_cpu_var(cpu_tlbstate).active_mm; +#endif + + if (active_mm == mm) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure -- cgit v1.1 From a9e7062d7339f1a1df2b6d7e5d595c7d55b56bfb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:41 -0700 Subject: xen: move smp setup into smp.c Move all the smp_ops setup into smp.c, allowing a lot of things to become static. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 +------------------ arch/x86/xen/smp.c | 34 ++++++++++++++++++++++++++-------- arch/x86/xen/xen-ops.h | 13 +++++-------- 3 files changed, 32 insertions(+), 34 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ada2e1a..a85f447 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1237,21 +1237,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_fixmap = xen_set_fixmap, }; -#ifdef CONFIG_SMP -static const struct smp_ops xen_smp_ops __initdata = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, - .smp_cpus_done = xen_smp_cpus_done, - - .smp_send_stop = xen_smp_send_stop, - .smp_send_reschedule = xen_smp_send_reschedule, - - .send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, -}; -#endif /* CONFIG_SMP */ - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1340,9 +1325,7 @@ asmlinkage void __init xen_start_kernel(void) have_vcpu_info_placement = 0; #endif -#ifdef CONFIG_SMP - smp_ops = xen_smp_ops; -#endif + xen_smp_init(); /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 233156f..91fae8f 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -152,7 +152,7 @@ void __init xen_fill_possible_map(void) } } -void __init xen_smp_prepare_boot_cpu(void) +static void __init xen_smp_prepare_boot_cpu(void) { int cpu; @@ -176,7 +176,7 @@ void __init xen_smp_prepare_boot_cpu(void) xen_setup_vcpu_info_placement(); } -void __init xen_smp_prepare_cpus(unsigned int max_cpus) +static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; @@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -int __cpuinit xen_cpu_up(unsigned int cpu) +static int __cpuinit xen_cpu_up(unsigned int cpu) { struct task_struct *idle = idle_task(cpu); int rc; @@ -319,7 +319,7 @@ int __cpuinit xen_cpu_up(unsigned int cpu) return 0; } -void xen_smp_cpus_done(unsigned int max_cpus) +static void xen_smp_cpus_done(unsigned int max_cpus) { } @@ -335,12 +335,12 @@ static void stop_self(void *v) BUG(); } -void xen_smp_send_stop(void) +static void xen_smp_send_stop(void) { smp_call_function(stop_self, NULL, 0); } -void xen_smp_send_reschedule(int cpu) +static void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } @@ -355,7 +355,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) xen_send_IPI_one(cpu, vector); } -void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(cpumask_t mask) { int cpu; @@ -370,7 +370,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask) } } -void xen_smp_send_call_function_single_ipi(int cpu) +static void xen_smp_send_call_function_single_ipi(int cpu) { xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } @@ -394,3 +394,21 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } + +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; + +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 77354d2..81a779f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -47,17 +47,14 @@ void xen_mark_init_mm_pinned(void); void __init xen_fill_possible_map(void); void __init xen_setup_vcpu_info_placement(void); -void xen_smp_prepare_boot_cpu(void); -void xen_smp_prepare_cpus(unsigned int max_cpus); -int xen_cpu_up(unsigned int cpu); -void xen_smp_cpus_done(unsigned int max_cpus); -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(cpumask_t mask); -void xen_smp_send_call_function_single_ipi(int cpu); +#ifdef CONFIG_SMP +void xen_smp_init(void); extern cpumask_t xen_cpu_initialized_map; +#else +static inline void xen_smp_init(void) {} +#endif /* Declare an asm function, along with symbols needed to make it -- cgit v1.1 From 5b09b2876ed1a8e34a0da8f069575fc6174e2077 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:42 -0700 Subject: x86_64: add workaround for no %gs-based percpu As a stopgap until Mike Travis's x86-64 gs-based percpu patches are ready, provide workaround functions for x86_read/write_percpu for Xen's use. Specifically, this means that we can't really make use of vcpu placement, because we can't use a single gs-based memory access to get to vcpu fields. So disable all that for now. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a85f447..f3f11ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -971,6 +971,7 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ +#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -980,6 +981,7 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } +#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1000,10 +1002,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { +#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); +#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1323,6 +1327,7 @@ asmlinkage void __init xen_start_kernel(void) #ifdef CONFIG_X86_64 /* Disable until direct per-cpu data access. */ have_vcpu_info_placement = 0; + x86_64_init_pda(); #endif xen_smp_init(); -- cgit v1.1 From c7b75947f89d45493562ede6d9ee7311dfa5c4ce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:43 -0700 Subject: xen64: smp.c compile hacking A number of random changes to make xen/smp.c compile in 64-bit mode. Signed-off-by: Jeremy Fitzhardinge a Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 7 +--- arch/x86/xen/smp.c | 98 +++++++++++++++++++++++++++++--------------------- arch/x86/xen/xen-ops.h | 2 -- 3 files changed, 58 insertions(+), 49 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0a3959..f52f385 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -98,7 +98,7 @@ void xen_enable_sysenter(void) /* Mask events on entry, even though they get enabled immediately */ static struct callback_register sysenter = { .type = CALLBACKTYPE_sysenter, - .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + .address = XEN_CALLBACK(__KERNEL_CS, xen_sysenter_target), .flags = CALLBACKF_mask_events, }; @@ -143,11 +143,6 @@ void __init xen_arch_setup(void) pm_idle = xen_idle; -#ifdef CONFIG_SMP - /* fill cpus_possible with all available cpus */ - xen_fill_possible_map(); -#endif - paravirt_disable_iospace(); fiddle_vdso(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 91fae8f..800bb21 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -66,13 +66,21 @@ static __cpuinit void cpu_bringup_and_idle(void) int cpu = smp_processor_id(); cpu_init(); + preempt_disable(); + xen_enable_sysenter(); - preempt_disable(); - per_cpu(cpu_state, cpu) = CPU_ONLINE; + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); xen_setup_cpu_clockevents(); + cpu_set(cpu, cpu_online_map); + x86_write_percpu(cpu_state, CPU_ONLINE); + wmb(); + /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -141,7 +149,7 @@ static int xen_smp_intr_init(unsigned int cpu) return rc; } -void __init xen_fill_possible_map(void) +static void __init xen_fill_possible_map(void) { int i, rc; @@ -154,24 +162,12 @@ void __init xen_fill_possible_map(void) static void __init xen_smp_prepare_boot_cpu(void) { - int cpu; - BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu__gdt_page); - - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_map lives in a per cpu area that is cleared - * when the per cpu array is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } + make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); xen_setup_vcpu_info_placement(); } @@ -180,17 +176,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_ map will be zeroed when the per - * cpu area is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } - smp_store_cpu_info(0); + cpu_data(0).x86_max_cores = 1; set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) @@ -225,7 +212,7 @@ static __cpuinit int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; - struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + struct desc_struct *gdt; if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) return 0; @@ -234,12 +221,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; + gdt = get_cpu_gdt_table(cpu); + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = 0; ctxt->user_regs.ss = __KERNEL_DS; +#ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; +#endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ @@ -249,11 +239,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt->gdt); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt); - ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); - ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); @@ -261,9 +251,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->kernel_ss = __KERNEL_DS; ctxt->kernel_sp = idle->thread.sp0; +#ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_cs = __KERNEL_CS; +#endif + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); @@ -287,11 +279,28 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) return rc; #endif +#ifdef CONFIG_X86_64 + /* Allocate node local memory for AP pdas */ + WARN_ON(cpu == 0); + if (cpu > 0) { + rc = get_local_pda(cpu); + if (rc) + return rc; + } +#endif + +#ifdef CONFIG_X86_32 init_gdt(cpu); per_cpu(current_task, cpu) = idle; irq_ctx_init(cpu); +#else + cpu_pda(cpu)->pcurrent = idle; + clear_tsk_thread_flag(idle, TIF_FORK); +#endif xen_setup_timer(cpu); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -306,16 +315,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) if (rc) return rc; - smp_store_cpu_info(cpu); - set_cpu_sibling_map(cpu); - /* This must be done before setting cpu_online_map */ - wmb(); - - cpu_set(cpu, cpu_online_map); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); + while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + barrier(); + } + return 0; } @@ -379,7 +386,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; @@ -389,7 +400,11 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; @@ -411,4 +426,5 @@ static const struct smp_ops xen_smp_ops __initdata = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; + xen_fill_possible_map(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 81a779f..aca4a78 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -44,8 +44,6 @@ bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); -void __init xen_fill_possible_map(void); - void __init xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP -- cgit v1.1 From 8c5e5ac32fe08793246709fbb94c055ec76a7c0e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:44 -0700 Subject: xen64: add xen-head code to head_64.S Add the Xen entrypoint and ELF notes to head_64.S. Adapts xen-head.S to compile either 32-bit or 64-bit. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a9cac9d..63d49a5 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -8,15 +8,21 @@ #include #include +#include #include #include __INIT ENTRY(startup_xen) - movl %esi,xen_start_info cld - movl $(init_thread_union+THREAD_SIZE),%esp +#ifdef CONFIG_X86_32 + mov %esi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%esp +#else + mov %rsi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%rsp +#endif jmp xen_start_kernel __FINIT @@ -30,7 +36,11 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") +#ifdef CONFIG_X86_32 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) +#endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") @@ -40,5 +50,6 @@ ENTRY(hypercall_page) .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) #endif /*CONFIG_XEN */ -- cgit v1.1 From cdacc1278b12d929f9a053c245ff3d16eb7af9f8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:46 -0700 Subject: xen64: add 64-bit assembler Split xen-asm into 32- and 64-bit files, and implement the 64-bit variants. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/xen-asm.S | 305 ---------------------------------------------- arch/x86/xen/xen-asm_32.S | 305 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm_64.S | 141 +++++++++++++++++++++ 4 files changed, 447 insertions(+), 306 deletions(-) delete mode 100644 arch/x86/xen/xen-asm.S create mode 100644 arch/x86/xen/xen-asm_32.S create mode 100644 arch/x86/xen/xen-asm_64.S (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 2ba2d16..59c1e53 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o suspend.o + time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S deleted file mode 100644 index 2497a30..0000000 --- a/arch/x86/xen/xen-asm.S +++ /dev/null @@ -1,305 +0,0 @@ -/* - Asm versions of Xen pv-ops, suitable for either direct use or inlining. - The inline versions are the same as the direct-use versions, with the - pre- and post-amble chopped off. - - This code is encoded for size rather than absolute efficiency, - with a view to being able to inline as much as possible. - - We only bother with direct forms (ie, vcpu in pda) of the operations - here; the indirect forms are better handled in C, since they're - generally too large to inline anyway. - */ - -#include - -#include -#include -#include -#include -#include - -#include - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -/* - Enable events. This clears the event mask and tests the pending - event status with one and operation. If there are pending - events, then enter the hypervisor to get them handled. - */ -ENTRY(xen_irq_enable_direct) - /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f - -2: call check_events -1: -ENDPATCH(xen_irq_enable_direct) - ret - ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) - - -/* - Disabling events is simply a matter of making the event mask - non-zero. - */ -ENTRY(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) - ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) - -/* - (xen_)save_fl is used to get the current interrupt enable status. - Callers expect the status to be in X86_EFLAGS_IF, and other bits - may be set in the return value. We take advantage of this by - making sure that X86_EFLAGS_IF has the right value (and other bits - in that byte are 0), but other bits in the return value are - undefined. We need to toggle the state of the bit, because - Xen and x86 use opposite senses (mask vs enable). - */ -ENTRY(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - setz %ah - addb %ah,%ah -ENDPATCH(xen_save_fl_direct) - ret - ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) - - -/* - In principle the caller should be passing us a value return - from xen_save_fl_direct, but for robustness sake we test only - the X86_EFLAGS_IF flag rather than the whole byte. After - setting the interrupt mask state, it checks for unmasked - pending events and enters the hypervisor to get them delivered - if so. - */ -ENTRY(xen_restore_fl_direct) - testb $X86_EFLAGS_IF>>8, %ah - setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f -2: call check_events -1: -ENDPATCH(xen_restore_fl_direct) - ret - ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) - -/* - We can't use sysexit directly, because we're not running in ring0. - But we can easily fake it up using iret. Assuming xen_sysexit - is jumped to with a standard stack frame, we can just strip it - back to a standard iret frame and use iret. - */ -ENTRY(xen_sysexit) - movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ - orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) - lea PT_EIP(%esp), %esp - - jmp xen_iret -ENDPROC(xen_sysexit) - -/* - This is run where a normal iret would be run, with the same stack setup: - 8: eflags - 4: cs - esp-> 0: eip - - This attempts to make sure that any pending events are dealt - with on return to usermode, but there is a small window in - which an event can happen just before entering usermode. If - the nested interrupt ends up setting one of the TIF_WORK_MASK - pending work flags, they will not be tested again before - returning to usermode. This means that a process can end up - with pending work, which will be unprocessed until the process - enters and leaves the kernel again, which could be an - unbounded amount of time. This means that a pending signal or - reschedule event could be indefinitely delayed. - - The fix is to notice a nested interrupt in the critical - window, and if one occurs, then fold the nested interrupt into - the current interrupt stack frame, and re-process it - iteratively rather than recursively. This means that it will - exit via the normal path, and all pending work will be dealt - with appropriately. - - Because the nested interrupt handler needs to deal with the - current stack state in whatever form its in, we keep things - simple by only using a single register which is pushed/popped - on the stack. - */ -ENTRY(xen_iret) - /* test eflags for special cases */ - testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) - jnz hyper_iret - - push %eax - ESP_OFFSET=4 # bytes pushed onto stack - - /* Store vcpu_info pointer for easy access. Do it this - way to avoid having to reload %fs */ -#ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl TI_cpu(%eax),%eax - movl __per_cpu_offset(,%eax,4),%eax - mov per_cpu__xen_vcpu(%eax),%eax -#else - movl per_cpu__xen_vcpu, %eax -#endif - - /* check IF state we're restoring */ - testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) - - /* Maybe enable events. Once this happens we could get a - recursive event, so the critical region starts immediately - afterwards. However, if that happens we don't end up - resuming the code, so we don't have to be worried about - being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) -xen_iret_start_crit: - - /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) - - /* If there's something pending, mask events again so we - can jump back into xen_hypervisor_callback */ - sete XEN_vcpu_info_mask(%eax) - - popl %eax - - /* From this point on the registers are restored and the stack - updated, so we don't need to worry about it if we're preempted */ -iret_restore_end: - - /* Jump to hypervisor_callback after fixing up the stack. - Events are masked, so jumping out of the critical - region is OK. */ - je xen_hypervisor_callback - -1: iret -xen_iret_end_crit: -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous - -hyper_iret: - /* put this out of line since its very rarely used */ - jmp hypercall_page + __HYPERVISOR_iret * 32 - - .globl xen_iret_start_crit, xen_iret_end_crit - -/* - This is called by xen_hypervisor_callback in entry.S when it sees - that the EIP at the time of interrupt was between xen_iret_start_crit - and xen_iret_end_crit. We're passed the EIP in %eax so we can do - a more refined determination of what to do. - - The stack format at this point is: - ---------------- - ss : (ss/esp may be present if we came from usermode) - esp : - eflags } outer exception info - cs } - eip } - ---------------- <- edi (copy dest) - eax : outer eax if it hasn't been restored - ---------------- - eflags } nested exception info - cs } (no ss/esp because we're nested - eip } from the same ring) - orig_eax }<- esi (copy src) - - - - - - - - - - fs } - es } - ds } SAVE_ALL state - eax } - : : - ebx }<- esp - ---------------- - - In order to deliver the nested exception properly, we need to shift - everything from the return addr up to the error code so it - sits just under the outer exception info. This means that when we - handle the exception, we do it in the context of the outer exception - rather than starting a new one. - - The only caveat is that if the outer eax hasn't been - restored yet (ie, it's still on stack), we need to insert - its value into the SAVE_ALL state before going on, since - it's usermode state which we eventually need to restore. - */ -ENTRY(xen_iret_crit_fixup) - /* - Paranoia: Make sure we're really coming from kernel space. - One could imagine a case where userspace jumps into the - critical range address, but just before the CPU delivers a GP, - it decides to deliver an interrupt instead. Unlikely? - Definitely. Easy to avoid? Yes. The Intel documents - explicitly say that the reported EIP for a bad jump is the - jump instruction itself, not the destination, but some virtual - environments get this wrong. - */ - movl PT_CS(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX(%esp), %esi - lea PT_EFLAGS(%esp), %edi - - /* If eip is before iret_restore_end then stack - hasn't been restored yet. */ - cmp $iret_restore_end, %eax - jae 1f - - movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ - movl %eax, PT_EAX(%esp) - - lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ - - /* set up the copy */ -1: std - mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ - rep movsl - cld - - lea 4(%edi),%esp /* point esp to new frame */ -2: jmp xen_do_upcall - - -/* - Force an event check by making a hypercall, - but preserve regs before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S new file mode 100644 index 0000000..2497a30 --- /dev/null +++ b/arch/x86/xen/xen-asm_32.S @@ -0,0 +1,305 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + +/* + We can't use sysexit directly, because we're not running in ring0. + But we can easily fake it up using iret. Assuming xen_sysexit + is jumped to with a standard stack frame, we can just strip it + back to a standard iret frame and use iret. + */ +ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ + orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) + lea PT_EIP(%esp), %esp + + jmp xen_iret +ENDPROC(xen_sysexit) + +/* + This is run where a normal iret would be run, with the same stack setup: + 8: eflags + 4: cs + esp-> 0: eip + + This attempts to make sure that any pending events are dealt + with on return to usermode, but there is a small window in + which an event can happen just before entering usermode. If + the nested interrupt ends up setting one of the TIF_WORK_MASK + pending work flags, they will not be tested again before + returning to usermode. This means that a process can end up + with pending work, which will be unprocessed until the process + enters and leaves the kernel again, which could be an + unbounded amount of time. This means that a pending signal or + reschedule event could be indefinitely delayed. + + The fix is to notice a nested interrupt in the critical + window, and if one occurs, then fold the nested interrupt into + the current interrupt stack frame, and re-process it + iteratively rather than recursively. This means that it will + exit via the normal path, and all pending work will be dealt + with appropriately. + + Because the nested interrupt handler needs to deal with the + current stack state in whatever form its in, we keep things + simple by only using a single register which is pushed/popped + on the stack. + */ +ENTRY(xen_iret) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + mov per_cpu__xen_vcpu(%eax),%eax +#else + movl per_cpu__xen_vcpu, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* Maybe enable events. Once this happens we could get a + recursive event, so the critical region starts immediately + afterwards. However, if that happens we don't end up + resuming the code, so we don't have to be worried about + being preempted to another CPU. */ + setz XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can jump back into xen_hypervisor_callback */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + + /* From this point on the registers are restored and the stack + updated, so we don't need to worry about it if we're preempted */ +iret_restore_end: + + /* Jump to hypervisor_callback after fixing up the stack. + Events are masked, so jumping out of the critical + region is OK. */ + je xen_hypervisor_callback + +1: iret +xen_iret_end_crit: +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + This is called by xen_hypervisor_callback in entry.S when it sees + that the EIP at the time of interrupt was between xen_iret_start_crit + and xen_iret_end_crit. We're passed the EIP in %eax so we can do + a more refined determination of what to do. + + The stack format at this point is: + ---------------- + ss : (ss/esp may be present if we came from usermode) + esp : + eflags } outer exception info + cs } + eip } + ---------------- <- edi (copy dest) + eax : outer eax if it hasn't been restored + ---------------- + eflags } nested exception info + cs } (no ss/esp because we're nested + eip } from the same ring) + orig_eax }<- esi (copy src) + - - - - - - - - + fs } + es } + ds } SAVE_ALL state + eax } + : : + ebx }<- esp + ---------------- + + In order to deliver the nested exception properly, we need to shift + everything from the return addr up to the error code so it + sits just under the outer exception info. This means that when we + handle the exception, we do it in the context of the outer exception + rather than starting a new one. + + The only caveat is that if the outer eax hasn't been + restored yet (ie, it's still on stack), we need to insert + its value into the SAVE_ALL state before going on, since + it's usermode state which we eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* + Paranoia: Make sure we're really coming from kernel space. + One could imagine a case where userspace jumps into the + critical range address, but just before the CPU delivers a GP, + it decides to deliver an interrupt instead. Unlikely? + Definitely. Easy to avoid? Yes. The Intel documents + explicitly say that the reported EIP for a bad jump is the + jump instruction itself, not the destination, but some virtual + environments get this wrong. + */ + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi + + /* If eip is before iret_restore_end then stack + hasn't been restored yet. */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) + + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi),%esp /* point esp to new frame */ +2: jmp xen_do_upcall + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %eax + push %ecx + push %edx + call force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S new file mode 100644 index 0000000..4ec1082 --- /dev/null +++ b/arch/x86/xen/xen-asm_64.S @@ -0,0 +1,141 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include + +#include +#include + +#include + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +#if 0 +#include + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + ret +#endif + +ENTRY(xen_iret) + pushq $0 + jmp hypercall_page + __HYPERVISOR_iret * 32 + +ENTRY(xen_sysexit) + ud2a -- cgit v1.1 From 15664f968a95d8fbf4a0d7b462fcc20f88906bb3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:47 -0700 Subject: xen64: use set_fixmap for shared_info structure Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f3f11ac..dbe3549 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -902,18 +902,11 @@ static __init void xen_pagetable_setup_start(pgd_t *base) void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - /* - * Create a mapping for the shared info page. - * Should be set_fixmap(), but shared_info is a machine - * address with no corresponding pseudo-phys address. - */ - set_pte_mfn(addr, - PFN_DOWN(xen_start_info->shared_info), - PAGE_KERNEL); - - HYPERVISOR_shared_info = (struct shared_info *)addr; + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); @@ -1050,8 +1043,13 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_F00F_BUG case FIX_F00F_IDT: #endif +#ifdef CONFIG_X86_32 case FIX_WP_TEST: case FIX_VDSO: + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif #ifdef CONFIG_X86_LOCAL_APIC case FIX_APIC_BASE: /* maps dummy local APIC */ #endif -- cgit v1.1 From 7d087b68d6ddb2398fb7f6e45990b7248de640ef Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:48 -0700 Subject: xen: cpu_detect is 32-bit only Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dbe3549..2b7bea3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1365,12 +1365,12 @@ asmlinkage void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); +#ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); -#ifdef CONFIG_X86_32 new_cpu_data.hard_math = 1; -#endif new_cpu_data.x86_capability[0] = cpuid_edx(1); +#endif /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; -- cgit v1.1 From 084a2a4e7656209ea93aac9778defa03213ca31d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:50 -0700 Subject: xen64: early mapping setup Set up the initial pagetables to map the kernel mapping into the physical mapping space. This makes __va() usable, since it requires physical mappings. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 192 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 176 insertions(+), 16 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2b7bea3..a991ee7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1294,6 +1295,157 @@ static void __init xen_reserve_top(void) #endif /* CONFIG_X86_32 */ } +#ifdef CONFIG_X86_64 +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ + return (void *)(paddr + __START_KERNEL_map); +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +static void walk(pgd_t *pgd, unsigned long addr) +{ + unsigned l4idx = pgd_index(addr); + unsigned l3idx = pud_index(addr); + unsigned l2idx = pmd_index(addr); + unsigned l1idx = pte_index(addr); + pgd_t l4; + pud_t l3; + pmd_t l2; + pte_t l1; + + xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", + pgd, addr, l4idx, l3idx, l2idx, l1idx); + + l4 = pgd[l4idx]; + xen_raw_printk(" l4: %016lx\n", l4.pgd); + xen_raw_printk(" %016lx\n", pgd_val(l4)); + + l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; + xen_raw_printk(" l3: %016lx\n", l3.pud); + xen_raw_printk(" %016lx\n", pud_val(l3)); + + l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; + xen_raw_printk(" l2: %016lx\n", l2.pmd); + xen_raw_printk(" %016lx\n", pmd_val(l2)); + + l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; + xen_raw_printk(" l1: %016lx\n", l1.pte); + xen_raw_printk(" %016lx\n", pte_val(l1)); +} + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016x pte=%016x\n", + addr, pfn, get_phys_to_machine(pfn), + pgprot_val(prot), pte.pte); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for(i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + */ +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + xen_write_cr3(__pa(pgd)); + + max_pfn_mapped = PFN_DOWN(__pa(pgd) + + xen_start_info->nr_pt_frames*PAGE_SIZE + + 512*1024); + + return pgd; +} +#else +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +{ + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + return pgd; +} +#endif /* CONFIG_X86_64 */ + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1336,32 +1488,29 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; -#ifdef CONFIG_X86_32 - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; -#endif + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + /* Don't do the full vcpu_info placement stuff until we have a + possible map and a non-dummy shared_info. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + + xen_raw_console_write("mapping kernel into physical memory\n"); + pgd = xen_setup_kernel_pagetable(pgd); - init_mm.pgd = pgd; /* use the Xen pagetables to start */ + init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ x86_write_percpu(xen_cr3, __pa(pgd)); x86_write_percpu(xen_current_cr3, __pa(pgd)); - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; - pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - /* set the limit of our address space */ xen_reserve_top(); @@ -1384,10 +1533,21 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("hvc", 0, NULL); } + xen_raw_console_write("about to get started...\n"); + +#if 0 + xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", + &boot_params, __pa_symbol(&boot_params), + __va(__pa_symbol(&boot_params))); + + walk(pgd, &boot_params); + walk(pgd, __va(__pa(&boot_params))); +#endif + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); #else - x86_64_start_kernel((char *)&boot_params); + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } -- cgit v1.1 From 22911b3f1cf5431058e56b1727e8ef77be5e0ac9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:51 -0700 Subject: xen64: 64-bit starts using set_pte from very early It also doesn't need the 32-bit hack version of set_pte for initial pagetable construction, so just make it use the real thing. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a991ee7..3924507 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1194,7 +1194,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else .set_pte = xen_set_pte_init, +#endif .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, -- cgit v1.1 From d114e1981cc1a51131230993a082c27c79ab370a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:52 -0700 Subject: xen64: map an initial chunk of physical memory Early in boot, map a chunk of extra physical memory for use later on. We need a pool of mapped pages to allocate further pages to construct pagetables mapping all physical memory. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 79 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3924507..e9e3baf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1382,6 +1382,61 @@ static void convert_pfn_mfn(void *v) } /* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +static __init void xen_map_identity_early(unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + BUG_ON(level2_ident_pgt[pmdidx].pmd != level2_kernel_pgt[pmdidx].pmd); + + /* Reuse or allocate a page of ptes */ + if (pmd_present(level2_ident_pgt[pmdidx])) + pte_page = m2v(level2_ident_pgt[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + /* Install new l1 in l2(s) */ + level2_ident_pgt[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + level2_kernel_pgt[pmdidx] = level2_ident_pgt[pmdidx]; + } + + /* Install mappings */ + for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); +} + +/* * Set up the inital kernel pagetable. * * We can construct this by grafting the Xen provided pagetable into @@ -1392,7 +1447,7 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1415,6 +1470,9 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Set up identity map */ + xen_map_identity_early(max_pfn); + /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); @@ -1424,7 +1482,7 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(init_level4_pgt))); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa_symbol(init_level4_pgt))); /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1433,19 +1491,23 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) pgd = init_level4_pgt; xen_write_cr3(__pa(pgd)); - max_pfn_mapped = PFN_DOWN(__pa(pgd) + - xen_start_info->nr_pt_frames*PAGE_SIZE + - 512*1024); + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); return pgd; } #else -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + x86_write_percpu(xen_cr3, __pa(pgd)); + x86_write_percpu(xen_current_cr3, __pa(pgd)); + return pgd; } #endif /* CONFIG_X86_64 */ @@ -1502,15 +1564,12 @@ asmlinkage void __init xen_start_kernel(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; xen_raw_console_write("mapping kernel into physical memory\n"); - pgd = xen_setup_kernel_pagetable(pgd); + pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); - pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; -- cgit v1.1 From 39dbc5bd345ebf93e066dde7f8e29467eb61b42e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:53 -0700 Subject: xen32: create initial mappings like 64-bit Rearrange the pagetable initialization to share code with the 64-bit kernel. Rather than deferring anything to pagetable_setup_start, just set up an initial pagetable in swapper_pg_dir early at startup, and create an additional 8MB of physical memory mappings. This matches the native head_32.S mappings to a large degree, and allows the rest of the pagetable setup to continue without much Xen vs. native difference. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 130 +++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 78 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e9e3baf..19c12a6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,50 +854,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { -#ifdef CONFIG_X86_32 - pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; - int i; - - init_mm.pgd = base; - /* - * copy top-level of Xen-supplied pagetable into place. This - * is a stand-in while we copy the pmd pages. - */ - memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); - - make_lowmem_page_readonly(pmd); - - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } - - /* make sure zero_page is mapped RO so we can use it in pagetables */ - make_lowmem_page_readonly(empty_zero_page); - make_lowmem_page_readonly(base); - /* - * Switch to new pagetable. This is done before - * pagetable_init has done anything so that the new pages - * added to the table can be prepared properly for Xen. - */ - xen_write_cr3(__pa(base)); - - /* Unpin initial Xen pagetable */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(xen_start_info->pt_base))); -#endif /* CONFIG_X86_32 */ } void xen_setup_shared_info(void) @@ -936,12 +892,6 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.set_pte = xen_set_pte; xen_setup_shared_info(); - -#ifdef CONFIG_X86_32 - /* Actually pin the pagetable down, but we can't set PG_pinned - yet because the page structures don't exist yet. */ - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); -#endif } static __init void xen_post_allocator_init(void) @@ -1299,14 +1249,17 @@ static void __init xen_reserve_top(void) #endif /* CONFIG_X86_32 */ } -#ifdef CONFIG_X86_64 /* * Like __va(), but returns address in the kernel mapping (which is * all we have until the physical memory mapping has been set up. */ static void *__ka(phys_addr_t paddr) { +#ifdef CONFIG_X86_64 return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif } /* Convert a machine address to physical address */ @@ -1326,6 +1279,7 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } +#ifdef CONFIG_X86_64 static void walk(pgd_t *pgd, unsigned long addr) { unsigned l4idx = pgd_index(addr); @@ -1356,13 +1310,14 @@ static void walk(pgd_t *pgd, unsigned long addr) xen_raw_printk(" l1: %016lx\n", l1.pte); xen_raw_printk(" %016lx\n", pte_val(l1)); } +#endif static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016x pte=%016x\n", + xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", addr, pfn, get_phys_to_machine(pfn), pgprot_val(prot), pte.pte); @@ -1370,17 +1325,6 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -static void convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for(i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} - /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1388,7 +1332,7 @@ static void convert_pfn_mfn(void *v) */ static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; -static __init void xen_map_identity_early(unsigned long max_pfn) +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; unsigned ident_pte; @@ -1399,11 +1343,9 @@ static __init void xen_map_identity_early(unsigned long max_pfn) for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { pte_t *pte_page; - BUG_ON(level2_ident_pgt[pmdidx].pmd != level2_kernel_pgt[pmdidx].pmd); - /* Reuse or allocate a page of ptes */ - if (pmd_present(level2_ident_pgt[pmdidx])) - pte_page = m2v(level2_ident_pgt[pmdidx].pmd); + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) @@ -1412,9 +1354,7 @@ static __init void xen_map_identity_early(unsigned long max_pfn) pte_page = &level1_ident_pgt[ident_pte]; ident_pte += PTRS_PER_PTE; - /* Install new l1 in l2(s) */ - level2_ident_pgt[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - level2_kernel_pgt[pmdidx] = level2_ident_pgt[pmdidx]; + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); } /* Install mappings */ @@ -1434,6 +1374,20 @@ static __init void xen_map_identity_early(unsigned long max_pfn) for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for(i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); } /* @@ -1471,18 +1425,18 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); /* Set up identity map */ - xen_map_identity_early(max_pfn); + xen_map_identity_early(level2_ident_pgt, max_pfn); /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa_symbol(init_level4_pgt))); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1498,17 +1452,37 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf return pgd; } -#else +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { + pmd_t *kernel_pmd; + init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - return pgd; + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; } #endif /* CONFIG_X86_64 */ -- cgit v1.1 From ebd879e397f6361727c36267a12d1650710e465a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:54 -0700 Subject: xen: fix truncation of machine address arbitrary_virt_to_machine can truncate a machine address if its above 4G. Cast the problem away. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2579e70..05d7392 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -186,7 +186,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) BUG_ON(pte == NULL); - return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); + return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); } void make_lowmem_page_readonly(void *vaddr) -- cgit v1.1 From ce803e705f1cbdd2703e83061622089b5b4a5417 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:55 -0700 Subject: xen64: use arbitrary_virt_to_machine for xen_set_pmd When building initial pagetables in 64-bit kernel the pud/pmd pointer may be in ioremap/fixmap space, so we need to walk the pagetable to look up the physical address. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 05d7392..a8f0232 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -178,8 +178,9 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } -xmaddr_t arbitrary_virt_to_machine(unsigned long address) +xmaddr_t arbitrary_virt_to_machine(void *vaddr) { + unsigned long address = (unsigned long)vaddr; unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & ~PAGE_MASK; @@ -253,7 +254,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); extend_mmu_update(&u); @@ -415,7 +417,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); extend_mmu_update(&u); -- cgit v1.1 From 4560a2947e32670fc6ede108c2b032c396180649 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:56 -0700 Subject: xen: set num_processors Someone's got to do it. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 800bb21..8310ca0 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -155,8 +155,10 @@ static void __init xen_fill_possible_map(void) for (i = 0; i < NR_CPUS; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) + if (rc >= 0) { + num_processors++; cpu_set(i, cpu_possible_map); + } } } -- cgit v1.1 From 8745f8b0b914cf1d617ecc49726c24011858c74e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:57 -0700 Subject: xen64: defer setting pagetable alloc/release ops We need to wait until the page structure is available to use the proper pagetable page alloc/release operations, since they use struct page to determine if a pagetable is pinned. This happened to work in 32bit because nobody allocated new pagetable pages in the interim between xen_pagetable_setup_done and xen_post_allocator_init, but the 64-bit kenrel needs to allocate more pagetable levels. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 19c12a6..da91404 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -878,30 +878,29 @@ void xen_setup_shared_info(void) static __init void xen_pagetable_setup_done(pgd_t *base) { - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - - pv_mmu_ops.set_pte = xen_set_pte; - xen_setup_shared_info(); } static __init void xen_post_allocator_init(void) { + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; #if PAGETABLE_LEVELS == 4 pv_mmu_ops.set_pgd = xen_set_pgd; #endif + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + xen_mark_init_mm_pinned(); } -- cgit v1.1 From 836fe2f291cb450a6193fa713878efe7d32bec6e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:58 -0700 Subject: xen: use set_pte_vaddr Make Xen's set_pte_mfn() use set_pte_vaddr rather than copying it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Juan Quintela Signed-off-by: Mark McLoughlin Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a8f0232..eb31ed2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -282,35 +282,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) */ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* stored as-is, to permit clearing entries */ - xen_set_pte(pte, mfn_pte(mfn, flags)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); } void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, -- cgit v1.1 From e176d367d0cc8b8efd2e0960c9edf5d2fe7cd9f1 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:59 -0700 Subject: xen64: xen_write_idt_entry() and cvt_gate_to_trap() Changed to use the (to-be-)unified descriptor structs. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index da91404..f5e96f7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -401,23 +401,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static int cvt_gate_to_trap(int vector, u32 low, u32 high, +static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - u8 type, dpl; - - type = (high >> 8) & 0x1f; - dpl = (high >> 13) & 3; - - if (type != 0xf && type != 0xe) + if (val->type != 0xf && val->type != 0xe) return 0; info->vector = vector; - info->address = (high & 0xffff0000) | (low & 0x0000ffff); - info->cs = low >> 16; - info->flags = dpl; + info->address = gate_offset(*val); + info->cs = gate_segment(*val); + info->flags = val->dpl; /* interrupt gates clear IF */ - if (type == 0xe) + if (val->type == 0xe) info->flags |= 4; return 1; @@ -444,11 +439,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) if (p >= start && (p + 8) <= end) { struct trap_info info[2]; - u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) + if (cvt_gate_to_trap(entrynum, g, &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -461,13 +455,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, { unsigned in, out, count; - count = (desc->size+1) / 8; + count = (desc->size+1) / sizeof(gate_desc); BUG_ON(count > 256); for (in = out = 0; in < count; in++) { - const u32 *entry = (u32 *)(desc->address + in * 8); + gate_desc *entry = (gate_desc*)(desc->address) + in; - if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out])) out++; } traps[out].address = 0; -- cgit v1.1 From 997409d3d0bd6894f33e31ced251c0fdf523aa14 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:00 -0700 Subject: xen64: deal with extra words Xen pushes onto exception frames Xen pushes two extra words containing the values of rcx and r11. This pvop hook copies the words back into their appropriate registers, and cleans them off the stack. This leaves the stack in native form, so the normal handler can run unchanged. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/xen-asm_64.S | 5 +++++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f5e96f7..9d94483 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1091,7 +1091,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { .safe_halt = xen_safe_halt, .halt = xen_halt, #ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, + .adjust_exception_frame = xen_adjust_exception_frame, #endif }; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 4ec1082..b147b49 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -133,6 +133,11 @@ check_events: ret #endif +ENTRY(xen_adjust_exception_frame) + mov 8+0(%rsp),%rcx + mov 8+8(%rsp),%r11 + ret $16 + ENTRY(xen_iret) pushq $0 jmp hypercall_page + __HYPERVISOR_iret * 32 diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index aca4a78..c4800a2 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -67,7 +67,9 @@ DECL_ASM(void, xen_irq_disable_direct, void); DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); +/* These are not functions, and cannot be called normally */ void xen_iret(void); void xen_sysexit(void); +void xen_adjust_exception_frame(void); #endif /* XEN_OPS_H */ -- cgit v1.1 From 952d1d7055c8cbf95b4ad2f90be5ed37db8a48ee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:01 -0700 Subject: xen64: add pvop for swapgs swapgs is a no-op under Xen, because the hypervisor makes sure the right version of %gs is current when switching between user and kernel modes. This means that the swapgs "implementation" can be inlined and used when the stack is unsafe (usermode). Unfortunately, it means that disabling patching will result in a non-booting kernel... Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9d94483..8b60982 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1076,6 +1076,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, + .lazy_mode = { .enter = paravirt_enter_lazy_cpu, .leave = xen_leave_lazy, -- cgit v1.1 From 88459d4c7eb68c4a15609e00e5d100e2a305f040 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:02 -0700 Subject: xen64: register callbacks in arch-independent way Use callback_op hypercall to register callbacks in a 32/64-bit independent way (64-bit doesn't need a code segment, but that detail is hidden in XEN_CALLBACK). Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f52f385..bea3d4f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -91,19 +91,25 @@ static void __init fiddle_vdso(void) *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } -void xen_enable_sysenter(void) +static __cpuinit int register_callback(unsigned type, const void *func) { - int cpu = smp_processor_id(); - extern void xen_sysenter_target(void); - /* Mask events on entry, even though they get enabled immediately */ - static struct callback_register sysenter = { - .type = CALLBACKTYPE_sysenter, - .address = XEN_CALLBACK(__KERNEL_CS, xen_sysenter_target), + struct callback_register callback = { + .type = type, + .address = XEN_CALLBACK(__KERNEL_CS, func), .flags = CALLBACKF_mask_events, }; + return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); +} + +void __cpuinit xen_enable_sysenter(void) +{ + int cpu = smp_processor_id(); + extern void xen_sysenter_target(void); + if (!boot_cpu_has(X86_FEATURE_SEP) || - HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { + register_callback(CALLBACKTYPE_sysenter, + xen_sysenter_target) != 0) { clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); } @@ -120,8 +126,9 @@ void __init xen_arch_setup(void) if (!xen_feature(XENFEAT_auto_translated_physmap)) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, - __KERNEL_CS, (unsigned long)xen_failsafe_callback); + if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) + BUG(); xen_enable_sysenter(); -- cgit v1.1 From 0725cbb97793d4e65bf148e4872959cdbb8c6ddd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:03 -0700 Subject: xen64: add identity irq->vector map The x86_64 interrupt subsystem is oriented towards vectors, as opposed to a flat irq space as it is in x86-32. This patch adds a simple identity irq->vector mapping so that we can continue to feed irqs into do_IRQ() and get a good result. Ideally x86_32 will unify with the 64-bit code and use vectors too. At that point we can move to mapping event channels to vectors, which will allow us to economise on irqs (so per-cpu event channels can share irqs, rather than having to allocte one per cpu, for example). Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8b60982..52f2292 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1085,8 +1085,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, + .init_IRQ = __xen_init_IRQ, .save_fl = xen_save_fl, .restore_fl = xen_restore_fl, .irq_disable = xen_irq_disable, -- cgit v1.1 From a8fc1089e49caa5dca346dfacb5c84abf9a22a0c Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:07:05 -0700 Subject: xen64: implement xen_load_gs_index() xen-64: implement xen_load_gs_index() Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 52f2292..3b6b7fc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -385,6 +385,14 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) loadsegment(gs, 0); } +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); +} +#endif + static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { @@ -1063,6 +1071,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif .store_gdt = native_store_gdt, .store_idt = native_store_idt, -- cgit v1.1 From 5deb30d194d28b6bf7dacfb758267a51bf7c5b78 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:06 -0700 Subject: xen: rework pgd_walk to deal with 32/64 bit Rewrite pgd_walk to deal with 64-bit address spaces. There are two notible features of 64-bit workspaces: 1. The physical address is only 48 bits wide, with the upper 16 bits being sign extension; kernel addresses are negative, and userspace is positive. 2. The Xen hypervisor mapping is at the negative-most address, just above the sign-extension hole. 1. means that we can't easily use addresses when traversing the space, since we must deal with sign extension. This rewrite expresses everything in terms of pgd/pud/pmd indices, which means we don't need to worry about the exact configuration of the virtual memory space. This approach works equally well in 32-bit. To deal with 2, assume the hole is between the uppermost userspace address and PAGE_OFFSET. For 64-bit this skips the Xen mapping hole. For 32-bit, the hole is zero-sized. In all cases, the uppermost kernel address is FIXADDR_TOP. A side-effect of this patch is that the upper boundary is actually handled properly, exposing a long-standing bug in 32-bit, which failed to pin kernel pmd page. The kernel pmd is not shared, and so must be explicitly pinned, even though the kernel ptes are shared and don't need pinning. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 115 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 75 insertions(+), 40 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eb31ed2..046c1f2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -491,77 +492,103 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) #endif /* PAGETABLE_LEVELS == 4 */ /* - (Yet another) pagetable walker. This one is intended for pinning a - pagetable. This means that it walks a pagetable and calls the - callback function on each page it finds making up the page table, - at every level. It walks the entire pagetable, but it only bothers - pinning pte pages which are below pte_limit. In the normal case - this will be TASK_SIZE, but at boot we need to pin up to - FIXADDR_TOP. But the important bit is that we don't pin beyond - there, because then we start getting into Xen's ptes. -*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. + */ +static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), unsigned long limit) { - pgd_t *pgd = pgd_base; int flush = 0; - unsigned long addr = 0; - unsigned long pgd_next; + unsigned hole_low, hole_high; + unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; + unsigned pgdidx, pudidx, pmdidx; - BUG_ON(limit > FIXADDR_TOP); + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1); + hole_high = pgd_index(PAGE_OFFSET); + + pgdidx_limit = pgd_index(limit); +#if PTRS_PER_PUD > 1 + pudidx_limit = pud_index(limit); +#else + pudidx_limit = 0; +#endif +#if PTRS_PER_PMD > 1 + pmdidx_limit = pmd_index(limit); +#else + pmdidx_limit = 0; +#endif + + flush |= (*func)(virt_to_page(pgd), PT_PGD); + + for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; - unsigned long pud_limit, pud_next; - pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + if (pgdidx >= hole_low && pgdidx < hole_high) + continue; - if (!pgd_val(*pgd)) + if (!pgd_val(pgd[pgdidx])) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ flush |= (*func)(virt_to_page(pud), PT_PUD); - for (; addr != pud_limit; pud++, addr = pud_next) { + for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; - unsigned long pmd_limit; - pud_next = pud_addr_end(addr, pud_limit); - - if (pud_next < limit) - pmd_limit = pud_next; - else - pmd_limit = limit; + if (pgdidx == pgdidx_limit && + pudidx > pudidx_limit) + goto out; - if (pud_none(*pud)) + if (pud_none(pud[pudidx])) continue; - pmd = pmd_offset(pud, 0); + pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ flush |= (*func)(virt_to_page(pmd), PT_PMD); - for (; addr != pmd_limit; pmd++) { - addr += (PAGE_SIZE * PTRS_PER_PTE); - if ((pmd_limit-1) < (addr-1)) { - addr = pmd_limit; - break; - } + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { + struct page *pte; + + if (pgdidx == pgdidx_limit && + pudidx == pudidx_limit && + pmdidx > pmdidx_limit) + goto out; - if (pmd_none(*pmd)) + if (pmd_none(pmd[pmdidx])) continue; - flush |= (*func)(pmd_page(*pmd), PT_PTE); + pte = pmd_page(pmd[pmdidx]); + flush |= (*func)(pte, PT_PTE); } } } - - flush |= (*func)(virt_to_page(pgd_base), PT_PGD); +out: return flush; } @@ -650,6 +677,11 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is pinnable */ + pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); +#endif + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } @@ -731,6 +763,10 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); +#endif pgd_walk(pgd, unpin_page, TASK_SIZE); xen_mc_issue(0); @@ -750,7 +786,6 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - printk("unpinning pinned %p\n", page_address(page)); xen_pgd_unpin((pgd_t *)page_address(page)); ClearPageSavePinned(page); } -- cgit v1.1 From b7c3c5c15936a40c79ef40af7b3bac801c7feb20 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:07 -0700 Subject: xen: make sure the kernel command line is right Point the boot params cmd_line_ptr to the domain-builder-provided command line. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b6b7fc..0172ba7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1587,6 +1587,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); if (!is_initial_xendomain()) { add_preferred_console("xenboot", 0, NULL); -- cgit v1.1 From 8a95408e183b3e4aaf3b6a66fa34bff4db53011b Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:07:10 -0700 Subject: xen64: Clear %fs on xen_load_tls() We need to do this, otherwise we can get a GPF on hypercall return after TLS descriptor is cleared but %fs is still pointing to it. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0172ba7..c13698f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -364,14 +364,6 @@ static void load_TLS_descriptor(struct thread_struct *t, static void xen_load_tls(struct thread_struct *t, unsigned int cpu) { - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); - /* * XXX sleazy hack: If we're being called in a lazy-cpu zone, * it means we're in a context switch, and %gs has just been @@ -380,9 +372,30 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * Either way, it has been saved, and the new value will get * loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. + * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 loadsegment(gs, 0); +#else + loadsegment(fs, 0); +#endif + } + + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); } #ifdef CONFIG_X86_64 -- cgit v1.1 From d6182fbf04164016cb6540db02eef3d6bdc967c3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:13 -0700 Subject: xen64: allocate and manage user pagetables Because the x86_64 architecture does not enforce segment limits, Xen cannot protect itself with them as it does in 32-bit mode. Therefore, to protect itself, it runs the guest kernel in ring 3. Since it also runs the guest userspace in ring3, the guest kernel must maintain a second pagetable for its userspace, which does not map kernel space. Naturally, the guest kernel pagetables map both kernel and userspace. The userspace pagetable is attached to the corresponding kernel pagetable via the pgd's page->private field. It is allocated and freed at the same time as the kernel pgd via the paravirt_pgd_alloc/free hooks. Fortunately, the user pagetable is almost entirely shared with the kernel pagetable; the only difference is the pgd page itself. set_pgd will populate all entries in the kernel pagetable, and also set the corresponding user pgd entry if the address is less than STACK_TOP_MAX. The user pagetable must be pinned and unpinned with the kernel one, but because the pagetables are aliased, pgd_walk() only needs to be called on the kernel pagetable. The user pgd page is then pinned/unpinned along with the kernel pgd page. xen_write_cr3 must write both the kernel and user cr3s. The init_mm.pgd pagetable never has a user pagetable allocated for it, because it can never be used while running usermode. One awkward area is that early in boot the page structures are not available. No user pagetable can exist at that point, but it complicates the logic to avoid looking at the page structure. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 99 ++++++++++++++++++++++++++++++++++++++++-------- arch/x86/xen/mmu.c | 91 +++++++++++++++++++++++++++++++++++++++----- arch/x86/xen/mmu.h | 2 + 3 files changed, 168 insertions(+), 24 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c13698f..48f1a7e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,7 +46,6 @@ #include #include #include -#include #include "xen-ops.h" #include "mmu.h" @@ -711,29 +710,57 @@ static void set_current_cr3(void *v) x86_write_percpu(xen_current_cr3, (unsigned long)v); } -static void xen_write_cr3(unsigned long cr3) +static void __xen_write_cr3(bool kernel, unsigned long cr3) { struct mmuext_op *op; struct multicall_space mcs; - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + unsigned long mfn; - BUG_ON(preemptible()); + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; - mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + WARN_ON(mfn == 0 && kernel); - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - x86_write_percpu(xen_cr3, cr3); + mcs = __xen_mc_entry(sizeof(*op)); op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; op->arg1.mfn = mfn; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - /* Update xen_update_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); + if (kernel) { + x86_write_percpu(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + x86_write_percpu(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } @@ -794,6 +821,40 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) xen_alloc_ptpage(mm, pfn, PT_PMD); } +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + + BUG_ON(page->private != 0); + + page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (page->private == 0) + ret = -ENOMEM; + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + /* This should never happen until we're OK to use struct page */ static void xen_release_ptpage(u32 pfn, unsigned level) { @@ -1168,8 +1229,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pgd_alloc = __paravirt_pgd_alloc, - .pgd_free = paravirt_nop, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, @@ -1480,7 +1541,15 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf /* Switch over */ pgd = init_level4_pgt; - xen_write_cr3(__pa(pgd)); + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); reserve_early(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 046c1f2..a44d56e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,6 +58,13 @@ #include "multicalls.h" #include "mmu.h" +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. + */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + + #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) @@ -461,17 +468,45 @@ pud_t xen_make_pud(pudval_t pud) return native_make_pud(pud); } -void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +pgd_t *xen_get_user_pgd(pgd_t *pgd) { - struct mmu_update u; + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; - preempt_disable(); + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } - xen_mc_batch(); + return user_ptr; +} + +static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + struct mmu_update u; u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); extend_mmu_update(&u); +} + +/* + * Raw hypercall-based set_pgd, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. It has no user pagetable attached to it + */ +void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + preempt_disable(); + + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -480,14 +515,28 @@ void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) void xen_set_pgd(pgd_t *ptr, pgd_t val) { + pgd_t *user_ptr = xen_get_user_pgd(ptr); + /* If page is not pinned, we can just update the entry directly */ if (!page_pinned(ptr)) { *ptr = val; + if (user_ptr) { + WARN_ON(page_pinned(user_ptr)); + *user_ptr = val; + } return; } - xen_set_pgd_hyper(ptr, val); + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + if (user_ptr) + __xen_set_pgd_hyper(user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } #endif /* PAGETABLE_LEVELS == 4 */ @@ -526,7 +575,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), * space, which contains the Xen mappings. On 32-bit these * will end up making a zero-sized hole and so is a no-op. */ - hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1); + hole_low = pgd_index(USER_LIMIT); hole_high = pgd_index(PAGE_OFFSET); pgdidx_limit = pgd_index(limit); @@ -670,19 +719,31 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + if (pgd_walk(pgd, pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); xen_mc_batch(); } +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + pin_page(virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ xen_mc_issue(0); } @@ -763,11 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + unpin_page(virt_to_page(user_pgd), PT_PGD); + } + } +#endif + #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - pgd_walk(pgd, unpin_page, TASK_SIZE); + + pgd_walk(pgd, unpin_page, USER_LIMIT); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 19d544b..0f59bd0 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -51,6 +51,8 @@ void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); #endif +pgd_t *xen_get_user_pgd(pgd_t *pgd); + pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -- cgit v1.1 From 6fcac6d305e8238939e169f4c52e8ec8a552a31f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:14 -0700 Subject: xen64: set up syscall and sysenter entrypoints for 64-bit We set up entrypoints for syscall and sysenter. sysenter is only used for 32-bit compat processes, whereas syscall can be used in by both 32 and 64-bit processes. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++ arch/x86/xen/setup.c | 42 +++++++++++++-- arch/x86/xen/smp.c | 1 + arch/x86/xen/xen-asm_64.S | 129 +++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/xen-ops.h | 3 ++ 5 files changed, 174 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 48f1a7e..87d3604 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1139,6 +1139,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .iret = xen_iret, .irq_enable_sysexit = xen_sysexit, +#ifdef CONFIG_X86_64 + .usergs_sysret32 = xen_sysret32, + .usergs_sysret64 = xen_sysret64, +#endif .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index bea3d4f..9d7a144 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -86,9 +86,11 @@ static void xen_idle(void) */ static void __init fiddle_vdso(void) { +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern const char vdso32_default_start; u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +#endif } static __cpuinit int register_callback(unsigned type, const void *func) @@ -106,15 +108,48 @@ void __cpuinit xen_enable_sysenter(void) { int cpu = smp_processor_id(); extern void xen_sysenter_target(void); + int ret; + +#ifdef CONFIG_X86_32 + if (!boot_cpu_has(X86_FEATURE_SEP)) { + return; + } +#else + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) { + return; + } +#endif - if (!boot_cpu_has(X86_FEATURE_SEP) || - register_callback(CALLBACKTYPE_sysenter, - xen_sysenter_target) != 0) { + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); + if(ret != 0) { clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); } } +void __cpuinit xen_enable_syscall(void) +{ +#ifdef CONFIG_X86_64 + int cpu = smp_processor_id(); + int ret; + extern void xen_syscall_target(void); + extern void xen_syscall32_target(void); + + ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); + if (ret != 0) { + printk("failed to set syscall: %d\n", ret); + clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SYSCALL); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SYSCALL); + } else { + ret = register_callback(CALLBACKTYPE_syscall32, + xen_syscall32_target); + if (ret != 0) + printk("failed to set 32-bit syscall: %d\n", ret); + } +#endif /* CONFIG_X86_64 */ +} + void __init xen_arch_setup(void) { struct physdev_set_iopl set_iopl; @@ -131,6 +166,7 @@ void __init xen_arch_setup(void) BUG(); xen_enable_sysenter(); + xen_enable_syscall(); set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8310ca0..f702199 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -69,6 +69,7 @@ static __cpuinit void cpu_bringup_and_idle(void) preempt_disable(); xen_enable_sysenter(); + xen_enable_syscall(); cpu = smp_processor_id(); smp_store_cpu_info(cpu); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index b147b49..4038cbf 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,8 @@ #include #include +#include +#include #include @@ -138,9 +140,132 @@ ENTRY(xen_adjust_exception_frame) mov 8+8(%rsp),%r11 ret $16 +hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 +/* + Xen64 iret frame: + + ss + rsp + rflags + cs + rip <-- standard iret frame + + flags + + rcx } + r11 }<-- pushed by hypercall page +rsp -> rax } + */ ENTRY(xen_iret) pushq $0 - jmp hypercall_page + __HYPERVISOR_iret * 32 +1: jmp hypercall_iret +ENDPATCH(xen_iret) +RELOC(xen_iret, 1b+1) +/* + sysexit is not used for 64-bit processes, so it's + only ever used to return to 32-bit compat userspace. + */ ENTRY(xen_sysexit) - ud2a + pushq $__USER32_DS + pushq %rcx + pushq $X86_EFLAGS_IF + pushq $__USER32_CS + pushq %rdx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysexit) +RELOC(xen_sysexit, 1b+1) + +ENTRY(xen_sysret64) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + + pushq $__USER_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret64) +RELOC(xen_sysret64, 1b+1) + +ENTRY(xen_sysret32) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack, %rsp + + pushq $__USER32_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER32_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret32) +RELOC(xen_sysret32, 1b+1) + +/* + Xen handles syscall callbacks much like ordinary exceptions, + which means we have: + - kernel gs + - kernel rsp + - an iret-like stack frame on the stack (including rcx and r11): + ss + rsp + rflags + cs + rip + r11 + rsp-> rcx + + In all the entrypoints, we undo all that to make it look + like a CPU-generated syscall/sysenter and jump to the normal + entrypoint. + */ + +.macro undo_xen_syscall + mov 0*8(%rsp),%rcx + mov 1*8(%rsp),%r11 + mov 5*8(%rsp),%rsp +.endm + +/* Normal 64-bit system call target */ +ENTRY(xen_syscall_target) + undo_xen_syscall + jmp system_call_after_swapgs +ENDPROC(xen_syscall_target) + +#ifdef CONFIG_IA32_EMULATION + +/* 32-bit compat syscall target */ +ENTRY(xen_syscall32_target) + undo_xen_syscall + jmp ia32_cstar_target +ENDPROC(xen_syscall32_target) + +/* 32-bit compat sysenter target */ +ENTRY(xen_sysenter_target) + undo_xen_syscall + jmp ia32_sysenter_target +ENDPROC(xen_sysenter_target) + +#else /* !CONFIG_IA32_EMULATION */ + +ENTRY(xen_syscall32_target) +ENTRY(xen_sysenter_target) + lea 16(%rsp), %rsp /* strip %rcx,%r11 */ + mov $-ENOSYS, %rax + pushq $VGCF_in_syscall + jmp hypercall_iret +ENDPROC(xen_syscall32_target) +ENDPROC(xen_sysenter_target) + +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c4800a2..dd3c231 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -26,6 +26,7 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void xen_enable_syscall(void); void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); @@ -70,6 +71,8 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ void xen_iret(void); void xen_sysexit(void); +void xen_sysret32(void); +void xen_sysret64(void); void xen_adjust_exception_frame(void); #endif /* XEN_OPS_H */ -- cgit v1.1 From bf18bf94dc72db998d0fbebc846c07c858a59c90 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:15 -0700 Subject: xen64: set up userspace syscall patch 64-bit userspace expects the vdso to be mapped at a specific fixed address, which happens to be in the middle of the kernel address space. Because we have split user and kernel pagetables, we need to make special arrangements for the vsyscall mapping to appear in the kernel part of the user pagetable. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 87d3604..f64b872 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -57,6 +57,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* * Note about cr3 (pagetable base) values: * * xen_cr3 contains the current logical cr3 value; it contains the @@ -831,12 +843,20 @@ static int xen_pgd_alloc(struct mm_struct *mm) #ifdef CONFIG_X86_64 { struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; BUG_ON(page->private != 0); - page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO); - if (page->private == 0) - ret = -ENOMEM; + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); } @@ -977,6 +997,9 @@ static __init void xen_post_allocator_init(void) pv_mmu_ops.release_pud = xen_release_pud; #endif +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_mark_init_mm_pinned(); } @@ -1088,6 +1111,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) } __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif } static const struct pv_info xen_info __initdata = { @@ -1427,13 +1459,6 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; - static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; @@ -1533,6 +1558,7 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); -- cgit v1.1 From 1153968a48e3ca3e2b7a437e8b82ec9e6f768e24 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:16 -0700 Subject: xen: implement Xen write_msr operation 64-bit uses MSRs for important things like the base for fs and gs-prefixed addresses. It's more efficient to use a hypercall to update these, rather than go via the trap and emulate path. Other MSR writes are just passed through; in an unprivileged domain they do nothing, but it might be useful later. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f64b872..776c0fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -777,6 +778,34 @@ static void xen_write_cr3(unsigned long cr3) xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +{ + int ret; + + ret = 0; + + switch(msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; + + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; + + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EFAULT; + break; +#endif + default: + ret = native_write_msr_safe(msr, low, high); + } + + return ret; +} + /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) @@ -1165,7 +1194,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, -- cgit v1.1 From 51dd660a2cd6eab4d470cfe1009c7f473832b786 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:17 -0700 Subject: xen: update Kconfig to allow 64-bit Xen Allow Xen to be enabled on 64-bit. Also extend domain size limit from 8 GB (on 32-bit) to 32 GB on 64-bit. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c2cc995..20b4972 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,8 @@ config XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK - depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) + depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) + depends on X86_CMPXCHG && X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the @@ -15,10 +15,11 @@ config XEN config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" - default 8 + default 8 if X86_32 + default 32 if X86_64 depends on XEN help The pseudo-physical to machine address array is sized according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here. \ No newline at end of file + need to be too stingy here. -- cgit v1.1 From b3fe124389f9dd97f0bbd954da2910e286648f0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 13:45:33 +0200 Subject: xen64: fix build error on 32-bit + !HIGHMEM fix: arch/x86/xen/enlighten.c: In function 'xen_set_fixmap': arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_BEGIN' undeclared (first use in this function) arch/x86/xen/enlighten.c:1127: error: (Each undeclared identifier is reported only once arch/x86/xen/enlighten.c:1127: error: for each function it appears in.) arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_END' undeclared (first use in this function) make[1]: *** [arch/x86/xen/enlighten.o] Error 1 make: *** [arch/x86/xen/enlighten.o] Error 2 FIX_KMAP_BEGIN is only available on HIGHMEM. Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 776c0fb..3da6acb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1124,7 +1124,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_32 case FIX_WP_TEST: case FIX_VDSO: +# ifdef CONFIG_HIGHMEM case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: #endif -- cgit v1.1 From 62541c376668042e20122864a044360707b2fb82 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 10 Jul 2008 16:24:08 -0700 Subject: xen64: disable 32-bit syscall/sysenter if not supported. Old versions of Xen (3.1 and before) don't support sysenter or syscall from 32-bit compat userspaces. If we can't set the appropriate syscall callback, then disable the corresponding feature bit, which will cause the vdso32 setup to fall back appropriately. Linux assumes that syscall is always available to 32-bit userspace, and installs it by default if sysenter isn't available. In that case, we just disable vdso altogether, forcing userspace libc to fall back to int $0x80. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9d7a144..9cce4a9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -106,46 +106,46 @@ static __cpuinit int register_callback(unsigned type, const void *func) void __cpuinit xen_enable_sysenter(void) { - int cpu = smp_processor_id(); extern void xen_sysenter_target(void); int ret; + unsigned sysenter_feature; #ifdef CONFIG_X86_32 - if (!boot_cpu_has(X86_FEATURE_SEP)) { - return; - } + sysenter_feature = X86_FEATURE_SEP; #else - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) { - return; - } + sysenter_feature = X86_FEATURE_SYSENTER32; #endif + if (!boot_cpu_has(sysenter_feature)) + return; + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); - if(ret != 0) { - clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); - } + if(ret != 0) + setup_clear_cpu_cap(sysenter_feature); } void __cpuinit xen_enable_syscall(void) { #ifdef CONFIG_X86_64 - int cpu = smp_processor_id(); int ret; extern void xen_syscall_target(void); extern void xen_syscall32_target(void); ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { - printk("failed to set syscall: %d\n", ret); - clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SYSCALL); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SYSCALL); - } else { + printk(KERN_ERR "Failed to set syscall: %d\n", ret); + /* Pretty fatal; 64-bit userspace has no other + mechanism for syscalls. */ + } + + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { ret = register_callback(CALLBACKTYPE_syscall32, xen_syscall32_target); - if (ret != 0) - printk("failed to set 32-bit syscall: %d\n", ret); + if (ret != 0) { + printk(KERN_INFO "Xen: 32-bit syscall not supported: disabling vdso\n"); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + sysctl_vsyscall32 = 0; + } } #endif /* CONFIG_X86_64 */ } -- cgit v1.1 From 71415c6a0877d5944d5dc3060f3b03513746158d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Jul 2008 22:41:34 +0200 Subject: x86, xen, vdso: fix build error fix: arch/x86/xen/built-in.o: In function `xen_enable_syscall': (.cpuinit.text+0xdb): undefined reference to `sysctl_vsyscall32' Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9cce4a9..3e11779 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -144,7 +144,9 @@ void __cpuinit xen_enable_syscall(void) if (ret != 0) { printk(KERN_INFO "Xen: 32-bit syscall not supported: disabling vdso\n"); setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +#ifdef CONFIG_COMPAT sysctl_vsyscall32 = 0; +#endif } } #endif /* CONFIG_X86_64 */ -- cgit v1.1 From 6a52e4b1cddd90fbfde8fb67021657936ee74b07 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:00 -0700 Subject: x86_64: further cleanup of 32-bit compat syscall mechanisms AMD only supports "syscall" from 32-bit compat usermode. Intel and Centaur(?) only support "sysenter" from 32-bit compat usermode. Set the X86 feature bits accordingly, and set up the vdso in accordance with those bits. On the offchance we run on in a 64-bit environment which supports neither syscall nor sysenter from 32-bit mode, then fall back to the int $0x80 vdso. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/setup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3e11779..e3648e6 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -83,12 +83,16 @@ static void xen_idle(void) /* * Set the bit indicating "nosegneg" library variants should be used. + * We only need to bother in pure 32-bit mode; compat 32-bit processes + * can have un-truncated segments, so wrapping around is allowed. */ static void __init fiddle_vdso(void) { -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) - extern const char vdso32_default_start; - u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); +#ifdef CONFIG_X86_32 + u32 *mask; + mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; + mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } -- cgit v1.1 From d5303b811b9d6dad2e7396d545eb7db414d42a61 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:06 -0700 Subject: x86: xen: no need to disable vdso32 Now that the vdso32 code can cope with both syscall and sysenter missing for 32-bit compat processes, just disable the features without disabling vdso altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/setup.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e3648e6..b6acc3a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -137,7 +137,7 @@ void __cpuinit xen_enable_syscall(void) ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { - printk(KERN_ERR "Failed to set syscall: %d\n", ret); + printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); /* Pretty fatal; 64-bit userspace has no other mechanism for syscalls. */ } @@ -145,13 +145,8 @@ void __cpuinit xen_enable_syscall(void) if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { ret = register_callback(CALLBACKTYPE_syscall32, xen_syscall32_target); - if (ret != 0) { - printk(KERN_INFO "Xen: 32-bit syscall not supported: disabling vdso\n"); + if (ret != 0) setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); -#ifdef CONFIG_COMPAT - sysctl_vsyscall32 = 0; -#endif - } } #endif /* CONFIG_X86_64 */ } -- cgit v1.1 From 56397f8dadb40055479a8ffff23f21a890098a31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:52 -0700 Subject: xen: use lock-byte spinlock implementation Switch to using the lock-byte spinlock implementation, to avoid the worst of the performance hit from ticket locks. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index f702199..a8ebafc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -430,4 +430,5 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); + paravirt_use_bytelocks(); } -- cgit v1.1 From 2d9e1e2f58b5612aa4eab0ab54c84308a29dbd79 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:53 -0700 Subject: xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 171 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a8ebafc..e693812 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -15,6 +15,7 @@ * This does not handle HOTPLUG_CPU yet. */ #include +#include #include #include @@ -35,6 +36,8 @@ #include "xen-ops.h" #include "mmu.h" +static void __cpuinit xen_init_lock_cpu(int cpu); + cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); @@ -179,6 +182,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; + xen_init_lock_cpu(0); + smp_store_cpu_info(0); cpu_data(0).x86_max_cores = 1; set_cpu_sibling_map(0); @@ -301,6 +306,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) clear_tsk_thread_flag(idle, TIF_FORK); #endif xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; @@ -413,6 +419,170 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +static inline void spinning_lock(struct xen_spinlock *xl) +{ + __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); +} + +static inline void unspinning_lock(struct xen_spinlock *xl) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before clearing lock */ + __get_cpu_var(lock_spinners) = NULL; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + /* announce we're spinning */ + spinning_lock(xl); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) + goto out; + + /* block until irq becomes pending */ + xen_poll_irq(irq); + kstat_this_cpu.irqs[irq]++; + +out: + unspinning_lock(xl); + return ret; +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int timeout; + u8 oldval; + + do { + timeout = 1 << 10; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static __cpuinit void xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + xen_reschedule_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +static void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} + static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, @@ -430,5 +600,5 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - paravirt_use_bytelocks(); + xen_init_spinlocks(); } -- cgit v1.1 From 93a0886e2368eafb9df5e2021fb185195cee88b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:43:42 -0700 Subject: x86, xen, power: fix up config dependencies on PM Xen save/restore needs bits of code enabled by PM_SLEEP, and PM_SLEEP depends on PM. So make XEN_SAVE_RESTORE depend on PM and PM_SLEEP depend on XEN_SAVE_RESTORE. Signed-off-by: Jeremy Fitzhardinge Acked-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 20b4972..3815e42 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,3 +23,8 @@ config XEN_MAX_DOMAIN_MEMORY according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no need to be too stingy here. + +config XEN_SAVE_RESTORE + bool + depends on PM + default y \ No newline at end of file -- cgit v1.1 From 593f4a788e5d09e9f00182561437461b0b564de4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 16 Jul 2008 19:15:30 +0100 Subject: x86: APIC: remove apic_write_around(); use alternatives Use alternatives to select the workaround for the 11AP Pentium erratum for the affected steppings on the fly rather than build time. Remove the X86_GOOD_APIC configuration option and replace all the calls to apic_write_around() with plain apic_write(), protecting accesses to the ESR as appropriate due to the 3AP Pentium erratum. Remove apic_read_around() and all its invocations altogether as not needed. Remove apic_write_atomic() and all its implementing backends. The use of ASM_OUTPUT2() is not strictly needed for input constraints, but I have used it for readability's sake. I had the feeling no one else was brave enough to do it, so I went ahead and here it is. Verified by checking the generated assembly and tested with both a 32-bit and a 64-bit configuration, also with the 11AP "feature" forced on and verified with gdb on /proc/kcore to work as expected (as an 11AP machines are quite hard to get hands on these days). Some script complained about the use of "volatile", but apic_write() needs it for the same reason and is effectively a replacement for writel(), so I have disregarded it. I am not sure what the policy wrt defconfig files is, they are generated and there is risk of a conflict resulting from an unrelated change, so I have left changes to them out. The option will get removed from them at the next run. Some testing with machines other than mine will be needed to avoid some stupid mistake, but despite its volume, the change is not really that intrusive, so I am fairly confident that because it works for me, it will everywhere. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb50845..7f26c37 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1131,7 +1131,6 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, -- cgit v1.1 From 95c7c23b06bc92f1772b9c9460845f179ba8c39e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:42:34 -0700 Subject: xen: report hypervisor version Various versions of the hypervisor have differences in what ABIs and features they support. Print some details into the boot log to help with remote debugging. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb50845..5328e46 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -167,10 +167,14 @@ void xen_vcpu_restore(void) static void __init xen_banner(void) { + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); - printk(KERN_INFO "Hypervisor signature: %s%s\n", - xen_start_info->magic, + printk(KERN_INFO "Xen version: %d.%d%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } -- cgit v1.1 From caf43bf7c6a55e89b6df5179df434d67e24aa32e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 14:06:50 +0200 Subject: x86, xen: fix apic_ops build on UP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/xen/enlighten.c:615: error: variable ‘xen_basic_apic_ops’ has initializer but incomplete type arch/x86/xen/enlighten.c:616: error: unknown field ‘read’ specified in initializer [...] Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 008b7b6..e4d1459 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include -- cgit v1.1 From 59438c9fc4f7a92c808c9049bc6b396f98bf954c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 22:59:42 -0700 Subject: x86: rename PTE_MASK to PTE_PFN_MASK Rusty, in his peevish way, complained that macros defining constants should have a name which somewhat accurately reflects the actual purpose of the constant. Aside from the fact that PTE_MASK gives no clue as to what's actually being masked, and is misleadingly similar to the functionally entirely different PMD_MASK, PUD_MASK and PGD_MASK, I don't really see what the problem is. But if this patch silences the incessent noise, then it will have achieved its goal (TODO: write test-case). Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 194bbd6..9ff6e3c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1435,7 +1435,7 @@ static unsigned long m2p(phys_addr_t maddr) { phys_addr_t paddr; - maddr &= PTE_MASK; + maddr &= PTE_PFN_MASK; paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; return paddr; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a44d56e..0db6912 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -343,8 +343,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_MASK; + unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_PFN_MASK; val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } @@ -354,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) static pteval_t pte_pfn_to_mfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_MASK; + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_PFN_MASK; val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } -- cgit v1.1 From 77be1fabd024b37423d12f832b1fbdb95dbdf494 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 22:59:56 -0700 Subject: x86: add PTE_FLAGS_MASK PTE_PFN_MASK was getting lonely, so I made it a friend. Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0db6912..aa37469 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -344,7 +344,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_PFN_MASK; + pteval_t flags = val & PTE_FLAGS_MASK; val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } @@ -355,7 +355,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) { if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_PFN_MASK; + pteval_t flags = val & PTE_FLAGS_MASK; val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } -- cgit v1.1 From 2dc1697eb355c34f9f7bcbbb83f490de248c360a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 16:49:58 -0700 Subject: xen: don't use sysret for sysexit32 When implementing sysexit32, don't let Xen use sysret to return to userspace. That results in usermode register state being trashed. Signed-off-by: Jeremy Fitzhardinge Cc: Mark McLoughlin Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-asm_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 4038cbf..7f58304 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -173,7 +173,7 @@ ENTRY(xen_sysexit) pushq $__USER32_CS pushq %rdx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysexit) RELOC(xen_sysexit, 1b+1) -- cgit v1.1 From 38ffbe66d59051fd9cfcfc8545f164700e2fa3bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 14:21:18 -0700 Subject: x86/paravirt/xen: properly fill out the ldt ops LTP testing showed that Xen does not properly implement sys_modify_ldt(). This patch does the final little bits needed to make the ldt work properly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3c..06219e6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,6 +325,26 @@ static unsigned long xen_store_tr(void) return 0; } +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readonly(v + i); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readwrite(v + i); +} + static void xen_set_ldt(const void *addr, unsigned entries) { struct mmuext_op *op; @@ -1220,6 +1240,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gs_index = xen_load_gs_index, #endif + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, -- cgit v1.1 From d5de8841355a48f7f634a04507185eaf1f9755e3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 13:28:58 -0700 Subject: x86: split spinlock implementations out into their own files ftrace requires certain low-level code, like spinlocks and timestamps, to be compiled without -pg in order to avoid infinite recursion. This patch splits out the core paravirt spinlocks and the Xen spinlocks into separate files which can be compiled without -pg. Also do xen/time.c while we're about it. As a result, we can now use ftrace within a Xen domain. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 8 ++- arch/x86/xen/smp.c | 167 ------------------------------------------- arch/x86/xen/spinlock.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 + 4 files changed, 193 insertions(+), 168 deletions(-) create mode 100644 arch/x86/xen/spinlock.c (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 59c1e53..5bfee24 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,10 @@ +ifdef CONFIG_FTRACE +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +endif + obj-y := enlighten.o setup.o multicalls.o mmu.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d8faf79..baca7f2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -15,7 +15,6 @@ * This does not handle HOTPLUG_CPU yet. */ #include -#include #include #include @@ -36,8 +35,6 @@ #include "xen-ops.h" #include "mmu.h" -static void __cpuinit xen_init_lock_cpu(int cpu); - cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); @@ -419,170 +416,6 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - unsigned short spinners; /* count of waiting cpus */ -}; - -static int xen_spin_is_locked(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -static inline void spinning_lock(struct xen_spinlock *xl) -{ - __get_cpu_var(lock_spinners) = xl; - wmb(); /* set lock of interest before count */ - asm(LOCK_PREFIX " incw %0" - : "+m" (xl->spinners) : : "memory"); -} - -static inline void unspinning_lock(struct xen_spinlock *xl) -{ - asm(LOCK_PREFIX " decw %0" - : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; -} - -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int irq = __get_cpu_var(lock_kicker_irq); - int ret; - - /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) - return 0; - - /* announce we're spinning */ - spinning_lock(xl); - - /* clear pending */ - xen_clear_irq_pending(irq); - - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; - - /* block until irq becomes pending */ - xen_poll_irq(irq); - kstat_this_cpu.irqs[irq]++; - -out: - unspinning_lock(xl); - return ret; -} - -static void xen_spin_lock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; - u8 oldval; - - do { - timeout = 1 << 10; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); - - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); -} - -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) -{ - int cpu; - - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; - } - } -} - -static void xen_spin_unlock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* make sure unlock happens before kick */ - barrier(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - -static __cpuinit void xen_init_lock_cpu(int cpu) -{ - int irq; - const char *name; - - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); - irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, - cpu, - xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - name, - NULL); - - if (irq >= 0) { - disable_irq(irq); /* make sure it's never delivered */ - per_cpu(lock_kicker_irq, cpu) = irq; - } - - printk("cpu %d spinlock event irq %d\n", cpu, irq); -} - -static void __init xen_init_spinlocks(void) -{ - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; -} - static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 0000000..8dc4d31 --- /dev/null +++ b/arch/x86/xen/spinlock.c @@ -0,0 +1,183 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include + +#include + +#include +#include + +#include "xen-ops.h" + +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +static inline void spinning_lock(struct xen_spinlock *xl) +{ + __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); +} + +static inline void unspinning_lock(struct xen_spinlock *xl) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before clearing lock */ + __get_cpu_var(lock_spinners) = NULL; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + /* announce we're spinning */ + spinning_lock(xl); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) + goto out; + + /* block until irq becomes pending */ + xen_poll_irq(irq); + kstat_this_cpu.irqs[irq]++; + +out: + unspinning_lock(xl); + return ret; +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int timeout; + u8 oldval; + + do { + timeout = 1 << 10; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static irqreturn_t dummy_handler(int irq, void *dev_id) +{ + BUG(); + return IRQ_HANDLED; +} + +void __cpuinit xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + dummy_handler, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index dd3c231..8847fb3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -50,6 +50,9 @@ void __init xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_init_spinlocks(void); +__cpuinit void xen_init_lock_cpu(int cpu); + extern cpumask_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} -- cgit v1.1 From b56afe1d41653fb07ab1b5af5ccc12001c4dd5a0 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 06219e6..e2767c2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1347,7 +1347,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.1 From a05d2ebab28011c2f3f520833f4bfdd2fd1b9c02 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 27 Jul 2008 08:45:02 -0700 Subject: xen: fix allocation and use of large ldts When the ldt gets to more than 1 page in size, the kernel uses vmalloc to allocate it. This means that: - when making the ldt RO, we must update the pages in both the vmalloc mapping and the linear mapping to make sure there are no RW aliases. - we need to use arbitrary_virt_to_machine to compute the machine addr for each update Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 51 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e2767c2..b011e4a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,24 +325,55 @@ static unsigned long xen_store_tr(void) return 0; } +/* + * If 'v' is a vmalloc mapping, then find the linear mapping of the + * page (if any) and also set its protections to match: + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); +} + static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readonly(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readwrite(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -446,7 +477,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -579,7 +610,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); -- cgit v1.1 From d89961e2dc87b6e30b8e3f60bd2af5cd92cf4643 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 13:48:58 -0700 Subject: xen: suppress known wrmsrs In general, Xen doesn't support wrmsr from an unprivileged domain; it just ends up ignoring the instruction and printing a message on the console. Given that there are sets of MSRs we know the kernel will try to write to, but we don't care, just eat them in xen_write_msr to cut down on console noise. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b011e4a..b795470 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,6 +854,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = -EFAULT; break; #endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + default: ret = native_write_msr_safe(msr, low, high); } -- cgit v1.1 From 0d1edf46ba229b46efacf75c0544b88c05a7b266 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 11:53:57 -0700 Subject: xen: compile irq functions without -pg for ftrace For some reason I managed to miss a bunch of irq-related functions which also need to be compiled without -pg when using ftrace. This patch moves them into their own file, and starts a cleanup process I've been meaning to do anyway. Signed-off-by: Jeremy Fitzhardinge Cc: Sam Ravnborg Cc: "Alex Nixon (Intern)" Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 3 +- arch/x86/xen/enlighten.c | 122 +-------------------------------------- arch/x86/xen/irq.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm_32.S | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- arch/x86/xen/xen-ops.h | 1 + 6 files changed, 150 insertions(+), 123 deletions(-) create mode 100644 arch/x86/xen/irq.c (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 5bfee24..9ee745f 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -2,9 +2,10 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg endif -obj-y := enlighten.o setup.o multicalls.o mmu.o \ +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b795470..cf8b3a9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -226,94 +225,6 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } -} - -static void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} - -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ - - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} - -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); -} - -static void xen_halt(void) -{ - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} - static void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -1308,36 +1219,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; -static void __init __xen_init_IRQ(void) -{ -#ifdef CONFIG_X86_64 - int i; - - /* Create identity vector->irq map */ - for(i = 0; i < NR_VECTORS; i++) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[i] = i; - } -#endif /* CONFIG_X86_64 */ - - xen_init_IRQ(); -} - -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, @@ -1740,10 +1621,11 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 0000000..28b85ab --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,143 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void xen_force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} + +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = __xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops() +{ + pv_irq_ops = xen_irq_ops; +} diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2497a30..42786f5 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -298,7 +298,7 @@ check_events: push %eax push %ecx push %edx - call force_evtchn_callback + call xen_force_evtchn_callback pop %edx pop %ecx pop %eax diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 7f58304..3b9bda4 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -122,7 +122,7 @@ check_events: push %r9 push %r10 push %r11 - call force_evtchn_callback + call xen_force_evtchn_callback pop %r11 pop %r10 pop %r9 diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8847fb3..3c70ebc 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,7 @@ void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); +void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); -- cgit v1.1 From cef43bf6b3afd819f7cdcba356af0e8220fb3789 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 13:33:44 -0700 Subject: xen: fix allocation and use of large ldts, cleanup Add a proper comment for set_aliased_prot() and fix an unsigned long/void * warning. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8b3a9..04ec69e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -237,8 +237,10 @@ static unsigned long xen_store_tr(void) } /* - * If 'v' is a vmalloc mapping, then find the linear mapping of the - * page (if any) and also set its protections to match: + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. */ static void set_aliased_prot(void *v, pgprot_t prot) { @@ -387,8 +389,7 @@ static void xen_load_gs_index(unsigned int idx) static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); -- cgit v1.1 From 169ad16bb87c10a3f7c108bb7008ebc0270f617a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 28 Jul 2008 18:32:09 -0300 Subject: xen_alloc_ptpage: cast PFN_PHYS() argument to unsigned long Currently paravirt_ops alloc_p*() uses u32 for the pfn args. We should change that later, but while the pfn parameter is still u32, we need to cast the PFN_PHYS() argument at xen_alloc_ptpage() to unsigned long, otherwise it will lose bits on the shift. I think PFN_PHYS() should behave better when fed with smaller integers, but a cast to unsigned long won't be enough for all cases on 32-bit PAE, and a cast to u64 would be overkill for most users of PFN_PHYS(). We could have two different flavors of PFN_PHYS: one for low pages only (unsigned long) and another that works for any page (u64)), but while we don't have it, we will need the cast to unsigned long on xen_alloc_ptpage(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 04ec69e..53afa14 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -822,7 +822,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else -- cgit v1.1 From 6e833587e11ed0dbf12e647127f2650e2f80b26d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:16:17 -0700 Subject: xen: clean up domain mode predicates There are four operating modes Xen code may find itself running in: - native - hvm domain - pv dom0 - pv domU Clean up predicates for testing for these states to make them more consistent. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 53afa14..b106e82 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1613,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1650,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) + if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a @@ -1685,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); -- cgit v1.1 From 11ad93e59d114f4b218873f1c93261be725d2e22 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:32:51 -0700 Subject: xen: clarify locking used when pinning a pagetable. Add some comments explaining the locking and pinning algorithm when using split pte locks. Also implement a minor optimisation of not pinning the PTE when not using split pte locks. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469..d3752b6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -590,8 +590,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmdidx_limit = 0; #endif - flush |= (*func)(virt_to_page(pgd), PT_PGD); - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; @@ -637,7 +635,11 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), } } } + out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(virt_to_page(pgd), PT_PGD); return flush; } @@ -691,6 +693,26 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) ptl = lock_pte(page); @@ -699,10 +721,9 @@ static int pin_page(struct page *page, enum pt_level level) pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ xen_mc_callback(do_unlock, ptl); @@ -796,10 +817,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { ptl = lock_pte(page); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -837,7 +866,7 @@ static void xen_pgd_unpin(pgd_t *pgd) #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif pgd_walk(pgd, unpin_page, USER_LIMIT); -- cgit v1.1 From 7708ad64a24a674f7905aa7a5099a50f055debec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:34:22 -0700 Subject: xen: add xen_ prefixes to make tracing with ftrace easier It's easier to pattern match on Xen function if they all start with xen_. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 66 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d3752b6..d9a35a3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -229,14 +229,14 @@ void make_lowmem_page_readwrite(void *vaddr) } -static bool page_pinned(void *ptr) +static bool xen_page_pinned(void *ptr) { struct page *page = virt_to_page(ptr); return PagePinned(page); } -static void extend_mmu_update(const struct mmu_update *update) +static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; @@ -265,7 +265,7 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -276,7 +276,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -334,7 +334,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -400,7 +400,7 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -411,7 +411,7 @@ void xen_set_pud(pud_t *ptr, pud_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -490,7 +490,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); } /* @@ -519,10 +519,10 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; if (user_ptr) { - WARN_ON(page_pinned(user_ptr)); + WARN_ON(xen_page_pinned(user_ptr)); *user_ptr = val; } return; @@ -555,8 +555,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), - unsigned long limit) +static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), + unsigned long limit) { int flush = 0; unsigned hole_low, hole_high; @@ -644,7 +644,9 @@ out: return flush; } -static spinlock_t *lock_pte(struct page *page) +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page) { spinlock_t *ptl = NULL; @@ -656,7 +658,7 @@ static spinlock_t *lock_pte(struct page *page) return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -674,7 +676,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -715,7 +717,7 @@ static int pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -726,7 +728,7 @@ static int pin_page(struct page *page, enum pt_level level) /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -740,7 +742,7 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, USER_LIMIT)) { + if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -754,14 +756,14 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ @@ -796,7 +798,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -804,10 +806,10 @@ static __init int mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -825,7 +827,7 @@ static int unpin_page(struct page *page, enum pt_level level) * partially-pinned state. */ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -839,7 +841,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -859,17 +861,17 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - pgd_walk(pgd, unpin_page, USER_LIMIT); + xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } @@ -936,7 +938,7 @@ static void drop_other_mm_ref(void *info) } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_t mask; unsigned cpu; @@ -966,7 +968,7 @@ static void drop_mm_ref(struct mm_struct *mm) smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -990,13 +992,13 @@ static void drop_mm_ref(struct mm_struct *mm) void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (page_pinned(mm->pgd)) + if (xen_page_pinned(mm->pgd)) xen_pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); -- cgit v1.1 From 168d2f464ab9860f0d1e66cf1f9684973222f1c6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:18 -0700 Subject: xen: save previous spinlock when blocking A spinlock can be interrupted while spinning, so make sure we preserve the previous lock of interest if we're taking a lock from within an interrupt handler. We also need to deal with the case where the blocking path gets interrupted between testing to see if the lock is free and actually blocking. If we get interrupted there and end up in the state where the lock is free but the irq isn't pending, then we'll block indefinitely in the hypervisor. This fix is to make sure that any nested lock-takers will always leave the irq pending if there's any chance the outer lock became free. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 65 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 15 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 8dc4d31..4884bc6 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -47,25 +47,41 @@ static int xen_spin_trylock(struct raw_spinlock *lock) static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); -static inline void spinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as interested in a lock. Returns the CPU's previous + * lock of interest, in case we got preempted by an interrupt. + */ +static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) { + struct xen_spinlock *prev; + + prev = __get_cpu_var(lock_spinners); __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" : "+m" (xl->spinners) : : "memory"); + + return prev; } -static inline void unspinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as no longer interested in a lock. Restores previous + * lock of interest (NULL for none). + */ +static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) { asm(LOCK_PREFIX " decw %0" : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; + wmb(); /* decrement count before restoring lock */ + __get_cpu_var(lock_spinners) = prev; } static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; @@ -74,23 +90,42 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) return 0; /* announce we're spinning */ - spinning_lock(xl); + prev = spinning_lock(xl); - /* clear pending */ - xen_clear_irq_pending(irq); + do { + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) { + /* + * If we interrupted another spinlock while it + * was blocking, make sure it doesn't block + * without rechecking the lock. + */ + if (prev != NULL) + xen_set_irq_pending(irq); + goto out; + } - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; + /* + * Block until irq becomes pending. If we're + * interrupted at this point (after the trylock but + * before entering the block), then the nested lock + * handler guarantees that the irq will be left + * pending if there's any chance the lock became free; + * xen_poll_irq() returns immediately if the irq is + * pending. + */ + xen_poll_irq(irq); + } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - /* block until irq becomes pending */ - xen_poll_irq(irq); kstat_this_cpu.irqs[irq]++; out: - unspinning_lock(xl); + unspinning_lock(xl, prev); return ret; } -- cgit v1.1 From 994025caba3e6beade9bde84dd1b70d9d250f27b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:19 -0700 Subject: xen: add debugfs support Add support for exporting statistics on mmu updates, multicall batching and pv spinlocks into debugfs. The base path is xen/ and each subsystem adds its own directory: mmu, multicalls, spinlocks. In each directory, writing 1 to "zero_stats" will cause the corresponding stats to be zeroed the next time they're updated. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 10 ++- arch/x86/xen/Makefile | 3 +- arch/x86/xen/debugfs.c | 123 ++++++++++++++++++++++++++++++++++ arch/x86/xen/debugfs.h | 10 +++ arch/x86/xen/mmu.c | 163 ++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/multicalls.c | 115 +++++++++++++++++++++++++++++++- arch/x86/xen/spinlock.c | 165 +++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 580 insertions(+), 9 deletions(-) create mode 100644 arch/x86/xen/debugfs.c create mode 100644 arch/x86/xen/debugfs.h (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 3815e42..d3e6846 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -27,4 +27,12 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool depends on PM - default y \ No newline at end of file + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. \ No newline at end of file diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 9ee745f..3139479 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -8,4 +8,5 @@ endif obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 0000000..b53225d --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,123 @@ +#include +#include +#include + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + +struct array_data +{ + void *array; + unsigned elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, unsigned array_size) +{ + size_t ret = 0; + unsigned i; + + for(i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, file->private_data, size); +} + +static int xen_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release= xen_array_release, + .read = u32_array_read, +}; + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 0000000..e281320 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,10 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d9a35a3..f5af913 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,6 +40,7 @@ */ #include #include +#include #include #include @@ -57,6 +58,61 @@ #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +#define MMU_UPDATE_HISTO 30 + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct { + u32 pgd_update; + u32 pgd_update_pinned; + u32 pgd_update_batched; + + u32 pud_update; + u32 pud_update_pinned; + u32 pud_update_batched; + + u32 pmd_update; + u32 pmd_update_pinned; + u32 pmd_update_batched; + + u32 pte_update; + u32 pte_update_pinned; + u32 pte_update_batched; + + u32 mmu_update; + u32 mmu_update_extended; + u32 mmu_update_histo[MMU_UPDATE_HISTO]; + + u32 prot_commit; + u32 prot_commit_batched; + + u32 set_pte_at; + u32 set_pte_at_batched; + u32 set_pte_at_pinned; + u32 set_pte_at_current; + u32 set_pte_at_kernel; +} mmu_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mmu_stats, 0, sizeof(mmu_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); mmu_stats.elem += (val); } while(0) + +#else /* !CONFIG_XEN_DEBUG_FS */ + +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +#endif /* CONFIG_XEN_DEBUG_FS */ /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -243,11 +299,21 @@ static void xen_extend_mmu_update(const struct mmu_update *update) mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - if (mcs.mc != NULL) + if (mcs.mc != NULL) { + ADD_STATS(mmu_update_extended, 1); + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); + mcs.mc->args[1]++; - else { + + if (mcs.mc->args[1] < MMU_UPDATE_HISTO) + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); + else + ADD_STATS(mmu_update_histo[0], 1); + } else { + ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; @@ -267,6 +333,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -274,6 +342,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) void xen_set_pmd(pmd_t *ptr, pmd_t val) { + ADD_STATS(pmd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -281,6 +351,8 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) return; } + ADD_STATS(pmd_update_pinned, 1); + xen_set_pmd_hyper(ptr, val); } @@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) preempt_disable(); + ADD_STATS(set_pte_at, 1); +// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); + ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + ADD_STATS(set_pte_at_batched, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); goto out; } else @@ -336,6 +414,9 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); + ADD_STATS(prot_commit, 1); + ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -402,6 +483,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -409,6 +492,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) void xen_set_pud(pud_t *ptr, pud_t val) { + ADD_STATS(pud_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -416,11 +501,17 @@ void xen_set_pud(pud_t *ptr, pud_t val) return; } + ADD_STATS(pud_update_pinned, 1); + xen_set_pud_hyper(ptr, val); } void xen_set_pte(pte_t *ptep, pte_t pte) { + ADD_STATS(pte_update, 1); +// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + #ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); @@ -517,6 +608,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); + ADD_STATS(pgd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -528,6 +621,9 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) return; } + ADD_STATS(pgd_update_pinned, 1); + ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + /* If it's pinned, then we can at least batch the kernel and user updates together. */ xen_mc_batch(); @@ -1003,3 +1099,66 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mmu_debug; + +static int __init xen_mmu_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); + + debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); + debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + + debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); + debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + + debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); + debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + + debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); +// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, +// &mmu_stats.pte_update_pinned); + debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, + &mmu_stats.pte_update_pinned); + + debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); + debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, + &mmu_stats.mmu_update_extended); + xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, + mmu_stats.mmu_update_histo, 20); + + debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); + debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_batched); + debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_current); + debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_kernel); + + debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); + debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, + &mmu_stats.prot_commit_batched); + + return 0; +} +fs_initcall(xen_mmu_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 9efd1c6..8ea8a0d 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -21,16 +21,20 @@ */ #include #include +#include #include #include "multicalls.h" +#include "debugfs.h" + +#define MC_BATCH 32 #define MC_DEBUG 1 -#define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16) + struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG @@ -47,6 +51,76 @@ struct mc_buffer { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +/* flush reasons 0- slots, 1- args, 2- callbacks */ +enum flush_reasons +{ + FL_SLOTS, + FL_ARGS, + FL_CALLBACKS, + + FL_N_REASONS +}; + +#ifdef CONFIG_XEN_DEBUG_FS +#define NHYPERCALLS 40 /* not really */ + +static struct { + unsigned histo[MC_BATCH+1]; + + unsigned issued; + unsigned arg_total; + unsigned hypercalls; + unsigned histo_hypercalls[NHYPERCALLS]; + + unsigned flush[FL_N_REASONS]; +} mc_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mc_stats, 0, sizeof(mc_stats)); + zero_stats = 0; + } +} + +static void mc_add_stats(const struct mc_buffer *mc) +{ + int i; + + check_zero(); + + mc_stats.issued++; + mc_stats.hypercalls += mc->mcidx; + mc_stats.arg_total += mc->argidx; + + mc_stats.histo[mc->mcidx]++; + for(i = 0; i < mc->mcidx; i++) { + unsigned op = mc->entries[i].op; + if (op < NHYPERCALLS) + mc_stats.histo_hypercalls[op]++; + } +} + +static void mc_stats_flush(enum flush_reasons idx) +{ + check_zero(); + + mc_stats.flush[idx]++; +} + +#else /* !CONFIG_XEN_DEBUG_FS */ + +static inline void mc_add_stats(const struct mc_buffer *mc) +{ +} + +static inline void mc_stats_flush(enum flush_reasons idx) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); @@ -60,6 +134,8 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); + mc_add_stats(b); + if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args) if (b->mcidx == MC_BATCH || (argidx + args) > MC_ARGS) { + mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + mc_stats_flush(FL_CALLBACKS); xen_mc_flush(); + } cb = &b->callbacks[b->cbidx++]; cb->fn = fn; cb->data = data; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mc_debug; + +static int __init xen_mc_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mc_debug = debugfs_create_dir("multicalls", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); + + debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); + debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); + debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); + + xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, + mc_stats.histo, MC_BATCH); + xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, + mc_stats.histo_hypercalls, NHYPERCALLS); + xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, + mc_stats.flush, FL_N_REASONS); + + return 0; +} +fs_initcall(xen_mc_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4884bc6..0d8f3b2 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -4,6 +4,8 @@ */ #include #include +#include +#include #include @@ -11,6 +13,93 @@ #include #include "xen-ops.h" +#include "debugfs.h" + +#ifdef CONFIG_XEN_DEBUG_FS +static struct xen_spinlock_stats +{ + u64 taken; + u32 taken_slow; + u32 taken_slow_nested; + u32 taken_slow_pickup; + u32 taken_slow_spurious; + + u64 released; + u32 released_slow; + u32 released_slow_kicked; + +#define HISTO_BUCKETS 20 + u32 histo_spin_fast[HISTO_BUCKETS+1]; + u32 histo_spin[HISTO_BUCKETS+1]; + + u64 spinning_time; + u64 total_time; +} spinlock_stats; + +static u8 zero_stats; + +static unsigned lock_timeout = 1 << 10; +#define TIMEOUT lock_timeout + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); spinlock_stats.elem += (val); } while(0) + +static inline u64 spin_time_start(void) +{ + return xen_clocksource_read(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index = ilog2(delta); + + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_fast(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_fast); + spinlock_stats.spinning_time += delta; +} + +static inline void spin_time_accum(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin); + spinlock_stats.total_time += delta; +} +#else /* !CONFIG_XEN_DEBUG_FS */ +#define TIMEOUT (1 << 10) +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_fast(u64 start) +{ +} +static inline void spin_time_accum(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ struct xen_spinlock { unsigned char lock; /* 0 -> free; 1 -> locked */ @@ -92,6 +181,9 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + ADD_STATS(taken_slow, 1); + ADD_STATS(taken_slow_nested, prev != NULL); + do { /* clear pending */ xen_clear_irq_pending(irq); @@ -100,6 +192,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) we weren't looking */ ret = xen_spin_trylock(lock); if (ret) { + ADD_STATS(taken_slow_pickup, 1); + /* * If we interrupted another spinlock while it * was blocking, make sure it doesn't block @@ -120,6 +214,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) * pending. */ xen_poll_irq(irq); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ kstat_this_cpu.irqs[irq]++; @@ -132,11 +227,18 @@ out: static void xen_spin_lock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; + unsigned timeout; u8 oldval; + u64 start_spin; + + ADD_STATS(taken, 1); + + start_spin = spin_time_start(); do { - timeout = 1 << 10; + u64 start_spin_fast = spin_time_start(); + + timeout = TIMEOUT; asm("1: xchgb %1,%0\n" " testb %1,%1\n" @@ -151,16 +253,22 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "1" (1) : "memory"); - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); + spin_time_accum_fast(start_spin_fast); + } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + spin_time_accum(start_spin); } static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; + ADD_STATS(released_slow, 1); + for_each_online_cpu(cpu) { /* XXX should mix up next cpu selection */ if (per_cpu(lock_spinners, cpu) == xl) { + ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; } @@ -171,6 +279,8 @@ static void xen_spin_unlock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + ADD_STATS(released, 1); + smp_wmb(); /* make sure no writes get moved after unlock */ xl->lock = 0; /* release lock */ @@ -216,3 +326,52 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_spin_debug; + +static int __init xen_spinlock_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); + + debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.taken_slow); + debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, + &spinlock_stats.taken_slow_nested); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.taken_slow_pickup); + debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, + &spinlock_stats.taken_slow_spurious); + + debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.released_slow); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.released_slow_kicked); + + debugfs_create_u64("time_spinning", 0444, d_spin_debug, + &spinlock_stats.spinning_time); + debugfs_create_u64("time_total", 0444, d_spin_debug, + &spinlock_stats.total_time); + + xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(xen_spinlock_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ -- cgit v1.1 From 1e696f638b523958ee3d95e655becdb4c4b6fcde Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:20 -0700 Subject: xen: allow interrupts to be enabled while doing a blocking spin If spin_lock is called in an interrupts-enabled context, we can safely enable interrupts while spinning. We don't bother for the actual spin loop, but if we timeout and fall back to blocking, it's definitely worthwhile enabling interrupts if possible. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0d8f3b2..e6061f2 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -23,6 +23,7 @@ static struct xen_spinlock_stats u32 taken_slow_nested; u32 taken_slow_pickup; u32 taken_slow_spurious; + u32 taken_slow_irqenable; u64 released; u32 released_slow; @@ -167,12 +168,13 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock __get_cpu_var(lock_spinners) = prev; } -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) @@ -181,6 +183,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + ADD_STATS(taken_slow, 1); ADD_STATS(taken_slow_nested, prev != NULL); @@ -220,11 +228,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) kstat_this_cpu.irqs[irq]++; out: + raw_local_irq_restore(flags); unspinning_lock(xl, prev); return ret; } -static void xen_spin_lock(struct raw_spinlock *lock) +static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; unsigned timeout; @@ -254,11 +263,23 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "memory"); spin_time_accum_fast(start_spin_fast); - } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + } while (unlikely(oldval != 0 && + (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); spin_time_accum(start_spin); } +static void xen_spin_lock(struct raw_spinlock *lock) +{ + __xen_spin_lock(lock, false); +} + +static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); +} + static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; @@ -323,6 +344,7 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } @@ -353,6 +375,8 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.taken_slow_pickup); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, &spinlock_stats.taken_slow_spurious); + debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, + &spinlock_stats.taken_slow_irqenable); debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, -- cgit v1.1 From f8eca41f0afcc7042dc3398c8ba5878c59b1cf1b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:21 -0700 Subject: xen: measure how long spinlocks spend blocking Measure how long spinlocks spend blocked. Also rename some fields to be more consistent. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 62 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 20 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e6061f2..d072823 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -29,12 +29,14 @@ static struct xen_spinlock_stats u32 released_slow; u32 released_slow_kicked; -#define HISTO_BUCKETS 20 - u32 histo_spin_fast[HISTO_BUCKETS+1]; - u32 histo_spin[HISTO_BUCKETS+1]; - - u64 spinning_time; - u64 total_time; +#define HISTO_BUCKETS 30 + u32 histo_spin_total[HISTO_BUCKETS+1]; + u32 histo_spin_spinning[HISTO_BUCKETS+1]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + + u64 time_total; + u64 time_spinning; + u64 time_blocked; } spinlock_stats; static u8 zero_stats; @@ -70,20 +72,28 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_spinning(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); + spinlock_stats.time_spinning += delta; +} + +static inline void spin_time_accum_total(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_fast); - spinlock_stats.spinning_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_total); + spinlock_stats.time_total += delta; } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin); - spinlock_stats.total_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) @@ -94,10 +104,13 @@ static inline u64 spin_time_start(void) return 0; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_total(u64 start) +{ +} +static inline void spin_time_accum_spinning(u64 start) { } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ @@ -175,11 +188,14 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl int irq = __get_cpu_var(lock_kicker_irq); int ret; unsigned long flags; + u64 start; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) return 0; + start = spin_time_start(); + /* announce we're spinning */ prev = spinning_lock(xl); @@ -230,6 +246,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl out: raw_local_irq_restore(flags); unspinning_lock(xl, prev); + spin_time_accum_blocked(start); + return ret; } @@ -262,12 +280,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) : "1" (1) : "memory"); - spin_time_accum_fast(start_spin_fast); + spin_time_accum_spinning(start_spin_fast); } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - spin_time_accum(start_spin); + spin_time_accum_total(start_spin); } static void xen_spin_lock(struct raw_spinlock *lock) @@ -385,14 +403,18 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.released_slow_kicked); debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.spinning_time); + &spinlock_stats.time_spinning); + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.total_time); + &spinlock_stats.time_total); xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } -- cgit v1.1 From f86399396ce7a4f4069828b7dceac5aa5113dfb5 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 30 Jul 2008 18:32:27 -0300 Subject: x86, paravirt_ops: use unsigned long instead of u32 for alloc_p*() pfn args This patch changes the pfn args from 'u32' to 'unsigned long' on alloc_p*() functions on paravirt_ops, and the corresponding implementations for Xen and VMI. The prototypes for CONFIG_PARAVIRT=n are already using unsigned long, so paravirt.h now matches the prototypes on asm-x86/pgalloc.h. It shouldn't result in any changes on generated code on 32-bit, with or without CONFIG_PARAVIRT. On both cases, 'codiff -f' didn't show any change after applying this patch. On 64-bit, there are (expected) binary changes only when CONFIG_PARAVIRT is enabled, as the patch is really supposed to change the size of the pfn args. [ v2: KVM_GUEST: use the right parameter type on kvm_release_pt() ] Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Acked-by: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3c..db970bd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -812,7 +812,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -822,7 +822,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(u32 pfn) +static void xen_release_pte_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -838,7 +838,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -856,12 +856,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -909,7 +909,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) } /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(u32 pfn, unsigned level) +static void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -923,23 +923,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pte(u32 pfn) +static void xen_release_pte(unsigned long pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pmd(u32 pfn) +static void xen_release_pmd(unsigned long pfn) { xen_release_ptpage(pfn, PT_PMD); } #if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); } -static void xen_release_pud(u32 pfn) +static void xen_release_pud(unsigned long pfn) { xen_release_ptpage(pfn, PT_PUD); } -- cgit v1.1 From ee7686bc043921a488d9bf89fe93241c5457a74e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 21 Aug 2008 13:17:56 -0700 Subject: x86: build fix in "xen spinlock updates and performance measurements" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ingo Molnar wrote: > -tip testing found this build failure: > > arch/x86/xen/spinlock.c: In function ‘spin_time_start’: > arch/x86/xen/spinlock.c:60: error: implicit declaration of function ‘xen_clocksource_read’ > > i've excluded these new commits for now from tip/master - could you > please send a delta fix against tip/x86/xen? Make xen_clocksource_read non-static. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 4 +--- arch/x86/xen/xen-ops.h | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 685b774..20182d9 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -30,8 +30,6 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -static cycle_t xen_clocksource_read(void); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void) return xen_khz; } -static cycle_t xen_clocksource_read(void) +cycle_t xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3c70ebc..1e8bfda 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,7 @@ #define XEN_OPS_H #include +#include #include #include @@ -33,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); void __init xen_time_init(void); -- cgit v1.1 From d68d82afd4c88e25763b23cd9cd4974573a3706f Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:15 +0100 Subject: xen: implement CPU hotplugging Note the changes from 2.6.18-xen CPU hotplugging: A vcpu_down request from the remote admin via Xenbus both hotunplugs the CPU, and disables it by removing it from the cpu_present map, and removing its entry in /sys. A vcpu_up request from the remote admin only re-enables the CPU, and does not immediately bring the CPU up. A udev event is emitted, which can be caught by the user if he wishes to automatically re-up CPUs when available, or implement a more complex policy. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 60 +++++++++++++++++++++++++++++++++++++++---------- arch/x86/xen/spinlock.c | 5 +++++ arch/x86/xen/time.c | 8 +++++++ arch/x86/xen/xen-ops.h | 6 +++++ 4 files changed, 67 insertions(+), 12 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index baca7f2..be5cbb2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -11,8 +11,6 @@ * useful topology information for the kernel to make use of. As a * result, all CPUs are treated as if they're single-core and * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. */ #include #include @@ -61,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup_and_idle(void) +static __cpuinit void cpu_bringup(void) { int cpu = smp_processor_id(); cpu_init(); + touch_softlockup_watchdog(); preempt_disable(); xen_enable_sysenter(); @@ -86,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void) local_irq_enable(); wmb(); /* make sure everything is out */ +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + cpu_bringup(); cpu_idle(); } @@ -209,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) cpu_set(cpu, cpu_present_map); } - - //init_xenbus_allowed_cpumask(); } static __cpuinit int @@ -278,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) struct task_struct *idle = idle_task(cpu); int rc; -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; -#endif - #ifdef CONFIG_X86_64 /* Allocate node local memory for AP pdas */ WARN_ON(cpu == 0); @@ -336,6 +332,42 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +int xen_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +void xen_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(0); +} + +void xen_play_dead(void) +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + cpu_bringup(); +} + static void stop_self(void *v) { int cpu = smp_processor_id(); @@ -419,9 +451,13 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, .smp_cpus_done = xen_smp_cpus_done, + .cpu_up = xen_cpu_up, + .cpu_die = xen_cpu_die, + .cpu_disable = xen_cpu_disable, + .play_dead = xen_play_dead, + .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d072823..dd71e3a 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -357,6 +357,11 @@ void __cpuinit xen_init_lock_cpu(int cpu) printk("cpu %d spinlock event irq %d\n", cpu, irq); } +void xen_uninit_lock_cpu(int cpu) +{ + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); +} + void __init xen_init_spinlocks(void) { pv_lock_ops.spin_is_locked = xen_spin_is_locked; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 20182d9..004ba86 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -450,6 +450,14 @@ void xen_setup_timer(int cpu) setup_runstate_info(cpu); } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu); + unbind_from_irqhandler(evt->irq, NULL); +} + void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e8bfda..8dbd97f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -34,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); @@ -50,11 +51,16 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); +void xen_play_dead(void); +void xen_cpu_die(unsigned int cpu); +int xen_cpu_disable(void); + #ifdef CONFIG_SMP void xen_smp_init(void); void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); +void xen_uninit_lock_cpu(int cpu); extern cpumask_t xen_cpu_initialized_map; #else -- cgit v1.1 From e4a6be4d2850da032a782b5296c07dfdf583af86 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3c..a4e201b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1324,7 +1324,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.1 From 2737146b3aa2cf8b5d5ae87a18c49fe1c374528b Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:33 +0100 Subject: x86, xen: fix build when !CONFIG_HOTPLUG_CPU Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index be5cbb2..bf51a46 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -332,6 +332,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +#ifdef CONFIG_HOTPLUG_CPU int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); @@ -368,6 +369,23 @@ void xen_play_dead(void) cpu_bringup(); } +#else /* !CONFIG_HOTPLUG_CPU */ +int xen_cpu_disable(void) +{ + return -ENOSYS; +} + +void xen_cpu_die(unsigned int cpu) +{ + BUG(); +} + +void xen_play_dead(void) +{ + BUG(); +} + +#endif static void stop_self(void *v) { int cpu = smp_processor_id(); -- cgit v1.1 From 26fd10517e810dd59ea050b052de24a75ee6dc07 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:34 +0100 Subject: xen: make CPU hotplug functions static There's no need for these functions to be accessed from outside of xen/smp.c Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 12 ++++++------ arch/x86/xen/xen-ops.h | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index bf51a46..d77da61 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -333,7 +333,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); if (cpu == 0) @@ -345,7 +345,7 @@ int xen_cpu_disable(void) return 0; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; @@ -362,7 +362,7 @@ void xen_cpu_die(unsigned int cpu) alternatives_smp_switch(0); } -void xen_play_dead(void) +static void xen_play_dead(void) { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); @@ -370,17 +370,17 @@ void xen_play_dead(void) } #else /* !CONFIG_HOTPLUG_CPU */ -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { return -ENOSYS; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { BUG(); } -void xen_play_dead(void) +static void xen_play_dead(void) { BUG(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8dbd97f..d7422dc 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,10 +51,6 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); -void xen_play_dead(void); -void xen_cpu_die(unsigned int cpu); -int xen_cpu_disable(void); - #ifdef CONFIG_SMP void xen_smp_init(void); -- cgit v1.1 From f7d0b926ac8c8ec0c7a83ee69409bd2e6bb39f81 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:22 -0700 Subject: mm: define USE_SPLIT_PTLOCKS rather than repeating expression Define USE_SPLIT_PTLOCKS as a constant expression rather than repeating "NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS" all over the place. Signed-off-by: Jeremy Fitzhardinge Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469..2e1b640 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -646,7 +646,7 @@ static spinlock_t *lock_pte(struct page *page) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); spin_lock(ptl); #endif -- cgit v1.1 From 6a9e91846bf52cc70a0417de19fdfac224c435c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:25 -0700 Subject: xen: fix pinning when not using split pte locks We only pin PTE pages when using split PTE locks, so don't do the pin/unpin when attaching/detaching pte pages to a pinned pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b106e82..8ca2f88 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -826,7 +826,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of @@ -894,7 +894,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -- cgit v1.1 From 5670a43d710a12fcbbfaefd3991002768b488d82 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 14 Sep 2008 07:42:23 -0700 Subject: xen: fix for xen guest with mem > 3.7G PFN_PHYS() can truncate large addresses unless its passed a suitable large type. This is fixed more generally in the patch series introducing phys_addr_t, but we need a short-term fix to solve a Xen regression reported by Roberto De Ioris. Reported-by: Roberto De Ioris Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b6acc3a..d679010 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -42,7 +42,7 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; - e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); + e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); /* * Even though this is normal, usable memory under Xen, reserve -- cgit v1.1 From 08115ab4d98cb577a83971ebd57cdfbcc6f50b68 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Mon, 29 Sep 2008 18:24:23 -0400 Subject: xen: make CONFIG_XEN_SAVE_RESTORE depend on CONFIG_XEN Xen options need to depend on XEN. Also, add newline at end of file. Without this patch you need to disable CONFIG_PM in order to disable CPU hotplugging. Signed-off-by: Chuck Ebbert Acked-by Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index d3e6846..87b9ab1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,7 +26,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on PM + depends on XEN && PM default y config XEN_DEBUG_FS @@ -35,4 +35,4 @@ config XEN_DEBUG_FS default n help Enable statistics output and various tuning options in debugfs. - Enabling this option may incur a significant performance overhead. \ No newline at end of file + Enabling this option may incur a significant performance overhead. -- cgit v1.1 From db053b86f4b1ec790da2dafe2acb93be76288bb9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Oct 2008 16:41:31 -0700 Subject: xen: clean up x86-64 warnings There are a couple of Xen features which rely on directly accessing per-cpu data via a segment register, which is not yet available on x86-64. In the meantime, just disable direct access to the vcpu info structure; this leaves some of the code as dead, but it will come to life in time, and the warnings are suppressed. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 61 ++++++++--------------------------------------- arch/x86/xen/xen-asm_64.S | 20 +++++++++++++--- 2 files changed, 27 insertions(+), 54 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ca2f88..85692c9f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -112,7 +112,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -941,6 +948,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -959,6 +967,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -1025,7 +1034,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ -#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -1035,7 +1043,6 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } -#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1056,12 +1063,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { -#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); -#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1399,48 +1404,11 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } -#ifdef CONFIG_X86_64 -static void walk(pgd_t *pgd, unsigned long addr) -{ - unsigned l4idx = pgd_index(addr); - unsigned l3idx = pud_index(addr); - unsigned l2idx = pmd_index(addr); - unsigned l1idx = pte_index(addr); - pgd_t l4; - pud_t l3; - pmd_t l2; - pte_t l1; - - xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", - pgd, addr, l4idx, l3idx, l2idx, l1idx); - - l4 = pgd[l4idx]; - xen_raw_printk(" l4: %016lx\n", l4.pgd); - xen_raw_printk(" %016lx\n", pgd_val(l4)); - - l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; - xen_raw_printk(" l3: %016lx\n", l3.pud); - xen_raw_printk(" %016lx\n", pud_val(l3)); - - l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; - xen_raw_printk(" l2: %016lx\n", l2.pmd); - xen_raw_printk(" %016lx\n", pmd_val(l2)); - - l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; - xen_raw_printk(" l1: %016lx\n", l1.pte); - xen_raw_printk(" %016lx\n", pte_val(l1)); -} -#endif - static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", - addr, pfn, get_phys_to_machine(pfn), - pgprot_val(prot), pte.pte); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1698,15 +1666,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); -#if 0 - xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", - &boot_params, __pa_symbol(&boot_params), - __va(__pa_symbol(&boot_params))); - - walk(pgd, &boot_params); - walk(pgd, __va(__pa(&boot_params))); -#endif - /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 3b9bda4..05794c5 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -26,8 +26,15 @@ /* Pseudo-flag used for virtual NMI, which we don't implement yet */ #define XEN_EFLAGS_NMI 0x80000000 -#if 0 -#include +#if 1 +/* + x86-64 does not yet support direct access to percpu variables + via a segment override, so we just need to make sure this code + never gets used + */ +#define BUG ud2a +#define PER_CPU_VAR(var, off) 0xdeadbeef +#endif /* Enable events. This clears the event mask and tests the pending @@ -35,6 +42,8 @@ events, then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) + BUG + /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) @@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct) non-zero. */ ENTRY(xen_irq_disable_direct) + BUG + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) ENDPATCH(xen_irq_disable_direct) ret @@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct) Xen and x86 use opposite senses (mask vs enable). */ ENTRY(xen_save_fl_direct) + BUG + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) setz %ah addb %ah,%ah @@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct) if so. */ ENTRY(xen_restore_fl_direct) + BUG + testb $X86_EFLAGS_IF>>8, %ah setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) /* Preempt here doesn't matter because that will deal with @@ -133,7 +148,6 @@ check_events: pop %rcx pop %rax ret -#endif ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp),%rcx -- cgit v1.1 From eefb47f6a1e855653d275cb90592a3587ea93a09 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 8 Oct 2008 13:01:39 -0700 Subject: xen: use spin_lock_nest_lock when pinning a pagetable When pinning/unpinning a pagetable with split pte locks, we can end up holding multiple pte locks at once (we need to hold the locks while there's a pending batched hypercall affecting the pte page). Because all the pte locks are in the same lock class, lockdep thinks that we're potentially taking a lock recursively. This warning is spurious because we always take the pte locks while holding mm->page_table_lock. lockdep now has spin_lock_nest_lock to express this kind of dominant lock use, so use it here so that lockdep knows what's going on. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 74 +++++++++++++++++++++++++++++++++++------------------- arch/x86/xen/mmu.h | 3 --- 2 files changed, 48 insertions(+), 29 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 64e5868..ae173f6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -651,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), unsigned long limit) { + pgd_t *pgd = mm->pgd; int flush = 0; unsigned hole_low, hole_high; unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; @@ -698,7 +701,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; @@ -713,7 +716,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { struct page *pte; @@ -727,7 +730,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), continue; pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(pte, PT_PTE); + flush |= (*func)(mm, pte, PT_PTE); } } } @@ -735,20 +738,20 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. */ - flush |= (*func)(virt_to_page(pgd), PT_PGD); + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } /* If we're using split pte locks, then take the page's lock and return a pointer to it. Otherwise return NULL. */ -static spinlock_t *xen_pte_lock(struct page *page) +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; #if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); - spin_lock(ptl); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; @@ -772,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int xen_pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -813,7 +817,7 @@ static int xen_pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -834,11 +838,11 @@ static int xen_pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); - if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { + if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -852,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - xen_pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ xen_mc_issue(0); } +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + /* * On save, we need to pin all pagetables to make sure they get their * mfns turned into pfns. Search the list for any unpinned pgds and pin * them (unpinned pgds are not currently in use, probably because the * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. */ void xen_mm_pin_all(void) { @@ -881,7 +895,7 @@ void xen_mm_pin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { - xen_pgd_pin((pgd_t *)page_address(page)); + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); SetPageSavePinned(page); } } @@ -894,7 +908,8 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; @@ -902,10 +917,11 @@ static __init int xen_mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int xen_unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -923,7 +939,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) * partially-pinned state. */ if (level == PT_PTE) { - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -945,7 +961,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); @@ -957,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - xen_unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif - xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); + xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + /* * On resume, undo any pinning done at save, so that the rest of the * kernel doesn't see any unexpected pinned pagetables. @@ -986,7 +1008,7 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - xen_pgd_unpin((pgd_t *)page_address(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); ClearPageSavePinned(page); } } @@ -997,14 +1019,14 @@ void xen_mm_unpin_all(void) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -1095,7 +1117,7 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (xen_page_pinned(mm->pgd)) - xen_pgd_unpin(mm->pgd); + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 0f59bd0..98d7165 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); void xen_exit_mmap(struct mm_struct *mm); -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - pteval_t xen_pte_val(pte_t); pmdval_t xen_pmd_val(pmd_t); pgdval_t xen_pgd_val(pgd_t); -- cgit v1.1 From 5dc64a3442b98eaa0e3730c35fcf00cf962a93e7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 10 Oct 2008 11:27:38 +0100 Subject: xen: do not reserve 2 pages of padding between hypervisor and fixmap. When reserving space for the hypervisor the Xen paravirt backend adds an extra two pages (this was carried forward from the 2.6.18-xen tree which had them "for safety"). Depending on various CONFIG options this can cause the boot time fixmaps to span multiple PMDs which is not supported and triggers a WARN in early_ioremap_init(). This was exposed by 2216d199b1430d1c0affb1498a9ebdbd9c0de439 which moved the dmi table parsing earlier. x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. There is no real reason to reserve these two extra pages and the fixmap already incorporates FIX_HOLE which serves the same purpose. None of the other callers of reserve_top_address do this. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 85692c9f..977a542 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1370,7 +1370,7 @@ static void __init xen_reserve_top(void) if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); #endif /* CONFIG_X86_32 */ } -- cgit v1.1