diff options
author | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
commit | fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch) | |
tree | 22962a4387943edc841c72a4e636a068c66d58fd /arch/powerpc/mm | |
download | ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz |
Initial import of modified Linux 2.6.28 tree
Original upstream URL:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'arch/powerpc/mm')
29 files changed, 11218 insertions, 0 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c new file mode 100644 index 0000000..29954dc --- /dev/null +++ b/arch/powerpc/mm/40x_mmu.c @@ -0,0 +1,146 @@ +/* + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/highmem.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <asm/bootx.h> +#include <asm/machdep.h> +#include <asm/setup.h> +#include "mmu_decl.h" + +extern int __map_without_ltlbs; +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + /* + * The Zone Protection Register (ZPR) defines how protection will + * be applied to every page which is a member of a given zone. At + * present, we utilize only two of the 4xx's zones. + * The zone index bits (of ZSEL) in the PTE are used for software + * indicators, except the LSB. For user access, zone 1 is used, + * for kernel access, zone 0 is used. We set all but zone 1 + * to zero, allowing only kernel access as indicated in the PTE. + * For zone 1, we set a 01 binary (a value of 10 will not work) + * to allow user access as indicated in the PTE. This also allows + * kernel access as indicated in the PTE. + */ + + mtspr(SPRN_ZPR, 0x10000000); + + flush_instruction_cache(); + + /* + * Set up the real-mode cache parameters for the exception vector + * handlers (which are run in real-mode). + */ + + mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ + + /* + * Cache instruction and data space where the exception + * vectors and the kernel live in real-mode. + */ + + mtspr(SPRN_DCCR, 0xF0000000); /* 512 MB of data space at 0x0. */ + mtspr(SPRN_ICCR, 0xF0000000); /* 512 MB of instr. space at 0x0. */ +} + +#define LARGE_PAGE_SIZE_16M (1<<24) +#define LARGE_PAGE_SIZE_4M (1<<22) + +unsigned long __init mmu_mapin_ram(void) +{ + unsigned long v, s, mapped; + phys_addr_t p; + + v = KERNELBASE; + p = 0; + s = total_lowmem; + + if (__map_without_ltlbs) + return 0; + + while (s >= LARGE_PAGE_SIZE_16M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + pmd_val(*pmdp++) = val; + pmd_val(*pmdp++) = val; + pmd_val(*pmdp++) = val; + pmd_val(*pmdp++) = val; + + v += LARGE_PAGE_SIZE_16M; + p += LARGE_PAGE_SIZE_16M; + s -= LARGE_PAGE_SIZE_16M; + } + + while (s >= LARGE_PAGE_SIZE_4M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + pmd_val(*pmdp) = val; + + v += LARGE_PAGE_SIZE_4M; + p += LARGE_PAGE_SIZE_4M; + s -= LARGE_PAGE_SIZE_4M; + } + + mapped = total_lowmem - s; + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 16 and 4 MiB + * pages. Consequently, restrict the top end of RAM currently + * allocable so that calls to the LMB to allocate PTEs for "tail" + * coverage with normal-sized pages (or other reasons) do not + * attempt to allocate outside the allowed range. + */ + + __initial_memory_limit_addr = memstart_addr + mapped; + + return mapped; +} diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c new file mode 100644 index 0000000..98052ac --- /dev/null +++ b/arch/powerpc/mm/44x_mmu.c @@ -0,0 +1,102 @@ +/* + * Modifications by Matt Porter (mporter@mvista.com) to support + * PPC44x Book E processors. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/init.h> +#include <asm/mmu.h> +#include <asm/system.h> +#include <asm/page.h> +#include <asm/cacheflush.h> + +#include "mmu_decl.h" + +/* Used by the 44x TLB replacement exception handler. + * Just needed it declared someplace. + */ +unsigned int tlb_44x_index; /* = 0 */ +unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS; +int icache_44x_need_flush; + +static void __init ppc44x_update_tlb_hwater(void) +{ + extern unsigned int tlb_44x_patch_hwater_D[]; + extern unsigned int tlb_44x_patch_hwater_I[]; + + /* The TLB miss handlers hard codes the watermark in a cmpli + * instruction to improve performances rather than loading it + * from the global variable. Thus, we patch the instructions + * in the 2 TLB miss handlers when updating the value + */ + tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) | + tlb_44x_hwater; + flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0], + (unsigned long)&tlb_44x_patch_hwater_D[1]); + tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) | + tlb_44x_hwater; + flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0], + (unsigned long)&tlb_44x_patch_hwater_I[1]); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem + */ +static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int entry = tlb_44x_hwater--; + + ppc44x_update_tlb_hwater(); + + __asm__ __volatile__( + "tlbwe %2,%3,%4\n" + "tlbwe %1,%3,%5\n" + "tlbwe %0,%3,%6\n" + : + : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), + "r" (phys), + "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), + "r" (entry), + "i" (PPC44x_TLB_PAGEID), + "i" (PPC44x_TLB_XLAT), + "i" (PPC44x_TLB_ATTRIB)); +} + +void __init MMU_init_hw(void) +{ + ppc44x_update_tlb_hwater(); + + flush_instruction_cache(); +} + +unsigned long __init mmu_mapin_ram(void) +{ + unsigned long addr; + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S */ + for (addr = PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + + return total_lowmem; +} diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile new file mode 100644 index 0000000..e7392b4 --- /dev/null +++ b/arch/powerpc/mm/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the linux ppc-specific parts of the memory manager. +# + +ifeq ($(CONFIG_PPC64),y) +EXTRA_CFLAGS += -mno-minimal-toc +endif + +obj-y := fault.o mem.o \ + init_$(CONFIG_WORD_SIZE).o \ + pgtable_$(CONFIG_WORD_SIZE).o \ + mmu_context_$(CONFIG_WORD_SIZE).o +hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o +obj-$(CONFIG_PPC64) += hash_utils_64.o \ + slb_low.o slb.o stab.o \ + gup.o mmap.o $(hash-y) +obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o +obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ + tlb_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_40x) += 40x_mmu.o +obj-$(CONFIG_44x) += 44x_mmu.o +obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o +obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o +obj-$(CONFIG_PPC_MM_SLICES) += slice.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c new file mode 100644 index 0000000..565b7a2 --- /dev/null +++ b/arch/powerpc/mm/fault.c @@ -0,0 +1,412 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Derived from "arch/i386/mm/fault.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Modified by Cort Dougan and Paul Mackerras. + * + * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/tlbflush.h> +#include <asm/siginfo.h> + + +#ifdef CONFIG_KPROBES +static inline int notify_page_fault(struct pt_regs *regs) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (!user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 11)) + ret = 1; + preempt_enable(); + } + + return ret; +} +#else +static inline int notify_page_fault(struct pt_regs *regs) +{ + return 0; +} +#endif + +/* + * Check whether the instruction at regs->nip is a store using + * an update addressing form which will update r1. + */ +static int store_updates_sp(struct pt_regs *regs) +{ + unsigned int inst; + + if (get_user(inst, (unsigned int __user *)regs->nip)) + return 0; + /* check for 1 in the rA field */ + if (((inst >> 16) & 0x1f) != 1) + return 0; + /* check major opcode */ + switch (inst >> 26) { + case 37: /* stwu */ + case 39: /* stbu */ + case 45: /* sthu */ + case 53: /* stfsu */ + case 55: /* stfdu */ + return 1; + case 62: /* std or stdu */ + return (inst & 3) == 1; + case 31: + /* check minor opcode */ + switch ((inst >> 1) & 0x3ff) { + case 181: /* stdux */ + case 183: /* stwux */ + case 247: /* stbux */ + case 439: /* sthux */ + case 695: /* stfsux */ + case 759: /* stfdux */ + return 1; + } + } + return 0; +} + +/* + * For 600- and 800-family processors, the error_code parameter is DSISR + * for a data fault, SRR1 for an instruction fault. For 400-family processors + * the error_code parameter is ESR for a data fault, 0 for an instruction + * fault. + * For 64-bit processors, the error_code parameter is + * - DSISR for a non-SLB data access fault, + * - SRR1 & 0x08000000 for a non-SLB instruction access fault + * - 0 any SLB fault. + * + * The return value is 0 if the fault was handled, or the signal + * number if this is a kernel fault that can't be handled here. + */ +int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + siginfo_t info; + int code = SEGV_MAPERR; + int is_write = 0, ret; + int trap = TRAP(regs); + int is_exec = trap == 0x400; + +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) + /* + * Fortunately the bit assignments in SRR1 for an instruction + * fault and DSISR for a data fault are mostly the same for the + * bits we are interested in. But there are some bits which + * indicate errors in DSISR but can validly be set in SRR1. + */ + if (trap == 0x400) + error_code &= 0x48200000; + else + is_write = error_code & DSISR_ISSTORE; +#else + is_write = error_code & ESR_DST; +#endif /* CONFIG_4xx || CONFIG_BOOKE */ + + if (notify_page_fault(regs)) + return 0; + + if (unlikely(debugger_fault_handler(regs))) + return 0; + + /* On a kernel SLB miss we can only check for a valid exception entry */ + if (!user_mode(regs) && (address >= TASK_SIZE)) + return SIGSEGV; + +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) + if (error_code & DSISR_DABRMATCH) { + /* DABR match */ + do_dabr(regs, address, error_code); + return 0; + } +#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ + + if (in_atomic() || mm == NULL) { + if (!user_mode(regs)) + return SIGSEGV; + /* in_atomic() in user mode is really bad, + as is current->mm == NULL. */ + printk(KERN_EMERG "Page fault in user mode with " + "in_atomic() = %d mm = %p\n", in_atomic(), mm); + printk(KERN_EMERG "NIP = %lx MSR = %lx\n", + regs->nip, regs->msr); + die("Weird page fault", regs, SIGSEGV); + } + + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if (!user_mode(regs) && !search_exception_tables(regs->nip)) + goto bad_area_nosemaphore; + + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + + /* + * N.B. The POWER/Open ABI allows programs to access up to + * 288 bytes below the stack pointer. + * The kernel signal delivery code writes up to about 1.5kB + * below the stack pointer (r1) before decrementing it. + * The exec code can write slightly over 640kB to the stack + * before setting the user r1. Thus we allow the stack to + * expand to 1MB without further checks. + */ + if (address + 0x100000 < vma->vm_end) { + /* get user regs even if this fault is in kernel mode */ + struct pt_regs *uregs = current->thread.regs; + if (uregs == NULL) + goto bad_area; + + /* + * A user-mode access to an address a long way below + * the stack pointer is only valid if the instruction + * is one which would update the stack pointer to the + * address accessed if the instruction completed, + * i.e. either stwu rs,n(r1) or stwux rs,r1,rb + * (or the byte, halfword, float or double forms). + * + * If we don't check this then any write to the area + * between the last mapped region and the stack will + * expand the stack rather than segfaulting. + */ + if (address + 2048 < uregs->gpr[1] + && (!user_mode(regs) || !store_updates_sp(regs))) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; + +good_area: + code = SEGV_ACCERR; +#if defined(CONFIG_6xx) + if (error_code & 0x95700000) + /* an error such as lwarx to I/O controller space, + address matching DABR, eciwx, etc. */ + goto bad_area; +#endif /* CONFIG_6xx */ +#if defined(CONFIG_8xx) + /* The MPC8xx seems to always set 0x80000000, which is + * "undefined". Of those that can be set, this is the only + * one which seems bad. + */ + if (error_code & 0x10000000) + /* Guarded storage error. */ + goto bad_area; +#endif /* CONFIG_8xx */ + + if (is_exec) { +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) + /* protection fault */ + if (error_code & DSISR_PROTFAULT) + goto bad_area; + /* + * Allow execution from readable areas if the MMU does not + * provide separate controls over reading and executing. + */ + if (!(vma->vm_flags & VM_EXEC) && + (cpu_has_feature(CPU_FTR_NOEXECUTE) || + !(vma->vm_flags & (VM_READ | VM_WRITE)))) + goto bad_area; +#else + pte_t *ptep; + pmd_t *pmdp; + + /* Since 4xx/Book-E supports per-page execute permission, + * we lazily flush dcache to icache. */ + ptep = NULL; + if (get_pteptr(mm, address, &ptep, &pmdp)) { + spinlock_t *ptl = pte_lockptr(mm, pmdp); + spin_lock(ptl); + if (pte_present(*ptep)) { + struct page *page = pte_page(*ptep); + + if (!test_bit(PG_arch_1, &page->flags)) { + flush_dcache_icache_page(page); + set_bit(PG_arch_1, &page->flags); + } + pte_update(ptep, 0, _PAGE_HWEXEC | + _PAGE_ACCESSED); + _tlbie(address, mm->context.id); + pte_unmap_unlock(ptep, ptl); + up_read(&mm->mmap_sem); + return 0; + } + pte_unmap_unlock(ptep, ptl); + } +#endif + /* a write */ + } else if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + /* a read */ + } else { + /* protection fault */ + if (error_code & 0x08000000) + goto bad_area; + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + survive: + ret = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(ret & VM_FAULT_ERROR)) { + if (ret & VM_FAULT_OOM) + goto out_of_memory; + else if (ret & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (ret & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; + up_read(&mm->mmap_sem); + return 0; + +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses cause a SIGSEGV */ + if (user_mode(regs)) { + _exception(SIGSEGV, regs, code, address); + return 0; + } + + if (is_exec && (error_code & DSISR_PROTFAULT) + && printk_ratelimit()) + printk(KERN_CRIT "kernel tried to execute NX-protected" + " page (%lx) - exploit attempt? (uid: %d)\n", + address, current->uid); + + return SIGSEGV; + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (is_global_init(current)) { + yield(); + down_read(&mm->mmap_sem); + goto survive; + } + printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) + do_group_exit(SIGKILL); + return SIGKILL; + +do_sigbus: + up_read(&mm->mmap_sem); + if (user_mode(regs)) { + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, current); + return 0; + } + return SIGBUS; +} + +/* + * bad_page_fault is called when we have a bad access from the kernel. + * It is called from the DSI and ISI handlers in head.S and from some + * of the procedures in traps.c. + */ +void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) +{ + const struct exception_table_entry *entry; + + /* Are we prepared to handle this fault? */ + if ((entry = search_exception_tables(regs->nip)) != NULL) { + regs->nip = entry->fixup; + return; + } + + /* kernel has accessed a bad area */ + + switch (regs->trap) { + case 0x300: + case 0x380: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "data at address 0x%08lx\n", regs->dar); + break; + case 0x400: + case 0x480: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "instruction fetch\n"); + break; + default: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "unknown fault\n"); + break; + } + printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", + regs->nip); + + die("Kernel access of bad area", regs, sig); +} diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c new file mode 100644 index 0000000..6046e51 --- /dev/null +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -0,0 +1,232 @@ +/* + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support + * E500 Book E processors. + * + * Copyright 2004 Freescale Semiconductor, Inc + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/highmem.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/setup.h> + +#include "mmu_decl.h" + +extern void loadcam_entry(unsigned int index); +unsigned int tlbcam_index; +unsigned int num_tlbcam_entries; +static unsigned long __cam0, __cam1, __cam2; + +#define NUM_TLBCAMS (16) + +struct tlbcam { + u32 MAS0; + u32 MAS1; + u32 MAS2; + u32 MAS3; + u32 MAS7; +} TLBCAM[NUM_TLBCAMS]; + +struct tlbcamrange { + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} tlbcam_addrs[NUM_TLBCAMS]; + +extern unsigned int tlbcam_index; + +/* + * Return PA for this VA if it is mapped by a CAM, or 0 + */ +phys_addr_t v_mapped_by_tlbcam(unsigned long va) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) + return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_mapped_by_tlbcam(phys_addr_t pa) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (pa >= tlbcam_addrs[b].phys + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + +tlbcam_addrs[b].phys) + return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); + return 0; +} + +/* + * Set up one of the I/D BAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 4 between 4k and 256M. + */ +void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, int flags, unsigned int pid) +{ + unsigned int tsize, lz; + + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size)); + tsize = (21 - lz) / 2; + +#ifdef CONFIG_SMP + if ((flags & _PAGE_NO_CACHE) == 0) + flags |= _PAGE_COHERENT; +#endif + + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); + TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); + TLBCAM[index].MAS2 = virt & PAGE_MASK; + + TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; + + TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + +#ifndef CONFIG_KGDB /* want user access for breakpoints */ + if (flags & _PAGE_USER) { + TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); + } +#else + TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); +#endif + + tlbcam_addrs[index].start = virt; + tlbcam_addrs[index].limit = virt + size - 1; + tlbcam_addrs[index].phys = phys; + + loadcam_entry(index); +} + +void invalidate_tlbcam_entry(int index) +{ + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index); + TLBCAM[index].MAS1 = ~MAS1_VALID; + + loadcam_entry(index); +} + +void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1, + unsigned long cam2) +{ + settlbcam(0, PAGE_OFFSET, memstart_addr, cam0, _PAGE_KERNEL, 0); + tlbcam_index++; + if (cam1) { + tlbcam_index++; + settlbcam(1, PAGE_OFFSET+cam0, memstart_addr+cam0, cam1, _PAGE_KERNEL, 0); + } + if (cam2) { + tlbcam_index++; + settlbcam(2, PAGE_OFFSET+cam0+cam1, memstart_addr+cam0+cam1, cam2, _PAGE_KERNEL, 0); + } +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + flush_instruction_cache(); +} + +unsigned long __init mmu_mapin_ram(void) +{ + cam_mapin_ram(__cam0, __cam1, __cam2); + + return __cam0 + __cam1 + __cam2; +} + + +void __init +adjust_total_lowmem(void) +{ + phys_addr_t max_lowmem_size = __max_low_memory; + phys_addr_t cam_max_size = 0x10000000; + phys_addr_t ram; + + /* adjust CAM size to max_lowmem_size */ + if (max_lowmem_size < cam_max_size) + cam_max_size = max_lowmem_size; + + /* adjust lowmem size to max_lowmem_size */ + ram = min(max_lowmem_size, total_lowmem); + + /* Calculate CAM values */ + __cam0 = 1UL << 2 * (__ilog2(ram) / 2); + if (__cam0 > cam_max_size) + __cam0 = cam_max_size; + ram -= __cam0; + if (ram) { + __cam1 = 1UL << 2 * (__ilog2(ram) / 2); + if (__cam1 > cam_max_size) + __cam1 = cam_max_size; + ram -= __cam1; + } + if (ram) { + __cam2 = 1UL << 2 * (__ilog2(ram) / 2); + if (__cam2 > cam_max_size) + __cam2 = cam_max_size; + ram -= __cam2; + } + + printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb," + " CAM2=%ldMb residual: %ldMb\n", + __cam0 >> 20, __cam1 >> 20, __cam2 >> 20, + (long int)((total_lowmem - __cam0 - __cam1 - __cam2) + >> 20)); + __max_low_memory = __cam0 + __cam1 + __cam2; + __initial_memory_limit_addr = memstart_addr + __max_low_memory; +} diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c new file mode 100644 index 0000000..28a114d --- /dev/null +++ b/arch/powerpc/mm/gup.c @@ -0,0 +1,281 @@ +/* + * Lockless get_user_pages_fast for powerpc + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#undef DEBUG + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/vmstat.h> +#include <linux/pagemap.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask, result; + pte_t *ptep; + + result = _PAGE_PRESENT|_PAGE_USER; + if (write) + result |= _PAGE_RW; + mask = result | _PAGE_SPECIAL; + + ptep = pte_offset_kernel(&pmd, addr); + do { + pte_t pte = *ptep; + struct page *page; + + if ((pte_val(pte) & mask) != result) + return 0; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + if (!page_cache_get_speculative(page)) + return 0; + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(page); + return 0; + } + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate, + unsigned long *addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate); + if (pte_end < end) + end = pte_end; + + pte = *ptep; + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (*addr += PAGE_SIZE, *addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + while (*nr) { + put_page(page); + (*nr)--; + } + } + + return 1; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int psize, nr = 0; + unsigned int shift; + + pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + goto slow_irqon; + + pr_debug(" aligned: %lx .. %lx\n", start, end); + +#ifdef CONFIG_HUGETLB_PAGE + /* We bail out on slice boundary crossing when hugetlb is + * enabled in order to not have to deal with two different + * page table formats + */ + if (addr < SLICE_LOW_TOP) { + if (end > SLICE_LOW_TOP) + goto slow_irqon; + + if (unlikely(GET_LOW_SLICE_INDEX(addr) != + GET_LOW_SLICE_INDEX(end - 1))) + goto slow_irqon; + } else { + if (unlikely(GET_HIGH_SLICE_INDEX(addr) != + GET_HIGH_SLICE_INDEX(end - 1))) + goto slow_irqon; + } +#endif /* CONFIG_HUGETLB_PAGE */ + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables from being freed on powerpc. + * + * So long as we atomically load page table pointers versus teardown, + * we can follow the address down to the the page and take a ref on it. + */ + local_irq_disable(); + + psize = get_slice_psize(mm, addr); + shift = mmu_psize_defs[psize].shift; + +#ifdef CONFIG_HUGETLB_PAGE + if (unlikely(mmu_huge_psizes[psize])) { + pte_t *ptep; + unsigned long a = addr; + unsigned long sz = ((1UL) << shift); + struct hstate *hstate = size_to_hstate(sz); + + BUG_ON(!hstate); + /* + * XXX: could be optimized to avoid hstate + * lookup entirely (just use shift) + */ + + do { + VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift); + ptep = huge_pte_offset(mm, a); + pr_debug(" %016lx: huge ptep %p\n", a, ptep); + if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages, + &nr)) + goto slow; + } while (a != end); + } else +#endif /* CONFIG_HUGETLB_PAGE */ + { + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); + pr_debug(" %016lx: normal pgd %p\n", addr, + (void *)pgd_val(pgd)); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + } + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + pr_debug(" slow path ! nr = %d\n", nr); + + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S new file mode 100644 index 0000000..7bffb70 --- /dev/null +++ b/arch/powerpc/mm/hash_low_32.S @@ -0,0 +1,665 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP + * Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu> + * Adapted for Power Macintosh by Paul Mackerras. + * Low-level exception handlers and MMU support + * rewritten by Paul Mackerras. + * Copyright (C) 1996 Paul Mackerras. + * + * This file contains low-level assembler routines for managing + * the PowerPC MMU hash table. (PPC 8xx processors don't use a + * hash table, so this file is not used on them.) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> + +#ifdef CONFIG_SMP + .section .bss + .align 2 + .globl mmu_hash_lock +mmu_hash_lock: + .space 4 +#endif /* CONFIG_SMP */ + +/* + * Sync CPUs with hash_page taking & releasing the hash + * table lock + */ +#ifdef CONFIG_SMP + .text +_GLOBAL(hash_page_sync) + mfmsr r10 + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + mtmsr r0 + lis r8,mmu_hash_lock@h + ori r8,r8,mmu_hash_lock@l + lis r0,0x0fff + b 10f +11: lwz r6,0(r8) + cmpwi 0,r6,0 + bne 11b +10: lwarx r6,0,r8 + cmpwi 0,r6,0 + bne- 11b + stwcx. r0,0,r8 + bne- 10b + isync + eieio + li r0,0 + stw r0,0(r8) + mtmsr r10 + blr +#endif /* CONFIG_SMP */ + +/* + * Load a PTE into the hash table, if possible. + * The address is in r4, and r3 contains an access flag: + * _PAGE_RW (0x400) if a write. + * r9 contains the SRR1 value, from which we use the MSR_PR bit. + * SPRG3 contains the physical address of the current task's thread. + * + * Returns to the caller if the access is illegal or there is no + * mapping for the address. Otherwise it places an appropriate PTE + * in the hash table and returns from the exception. + * Uses r0, r3 - r8, r10, ctr, lr. + */ + .text +_GLOBAL(hash_page) + tophys(r7,0) /* gets -KERNELBASE into r7 */ +#ifdef CONFIG_SMP + addis r8,r7,mmu_hash_lock@h + ori r8,r8,mmu_hash_lock@l + lis r0,0x0fff + b 10f +11: lwz r6,0(r8) + cmpwi 0,r6,0 + bne 11b +10: lwarx r6,0,r8 + cmpwi 0,r6,0 + bne- 11b + stwcx. r0,0,r8 + bne- 10b + isync +#endif + /* Get PTE (linux-style) and check access */ + lis r0,KERNELBASE@h /* check if kernel address */ + cmplw 0,r4,r0 + mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ + ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ + lwz r5,PGDIR(r8) /* virt page-table root */ + blt+ 112f /* assume user more likely */ + lis r5,swapper_pg_dir@ha /* if kernel address, use */ + addi r5,r5,swapper_pg_dir@l /* kernel page table */ + rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +112: add r5,r5,r7 /* convert to phys addr */ +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ + lwz r8,0(r5) /* get pmd entry */ + rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif +#ifdef CONFIG_SMP + beq- hash_page_out /* return if no mapping */ +#else + /* XXX it seems like the 601 will give a machine fault on the + rfi if its alignment is wrong (bottom 4 bits of address are + 8 or 0xc) and we have had a not-taken conditional branch + to the address following the rfi. */ + beqlr- +#endif +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif + rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE + + /* + * Update the linux PTE atomically. We do the lwarx up-front + * because almost always, there won't be a permission violation + * and there won't already be an HPTE, and thus we will have + * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif +retry: + lwarx r6,0,r8 /* get linux-style pte, flag word */ + andc. r5,r3,r6 /* check access & ~permission */ +#ifdef CONFIG_SMP + bne- hash_page_out /* return if access not permitted */ +#else + bnelr- +#endif + or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + stwcx. r5,0,r8 /* attempt to update PTE */ + bne- retry /* retry if someone got there first */ + + mfsrin r3,r4 /* get segment reg for segment */ + mfctr r0 + stw r0,_CTR(r11) + bl create_hpte /* add the hash table entry */ + +#ifdef CONFIG_SMP + eieio + addis r8,r7,mmu_hash_lock@ha + li r0,0 + stw r0,mmu_hash_lock@l(r8) +#endif + + /* Return from the exception */ + lwz r5,_CTR(r11) + mtctr r5 + lwz r0,GPR0(r11) + lwz r7,GPR7(r11) + lwz r8,GPR8(r11) + b fast_exception_return + +#ifdef CONFIG_SMP +hash_page_out: + eieio + addis r8,r7,mmu_hash_lock@ha + li r0,0 + stw r0,mmu_hash_lock@l(r8) + blr +#endif /* CONFIG_SMP */ + +/* + * Add an entry for a particular page to the hash table. + * + * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval) + * + * We assume any necessary modifications to the pte (e.g. setting + * the accessed bit) have already been done and that there is actually + * a hash table in use (i.e. we're not on a 603). + */ +_GLOBAL(add_hash_page) + mflr r0 + stw r0,4(r1) + + /* Convert context and va to VSID */ + mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note create_hpte trims to 24 bits */ + +#ifdef CONFIG_SMP + rlwinm r8,r1,0,0,(31-THREAD_SHIFT) /* use cpu number to make tag */ + lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ + oris r8,r8,12 +#endif /* CONFIG_SMP */ + + /* + * We disable interrupts here, even on UP, because we don't + * want to race with hash_page, and because we want the + * _PAGE_HASHPTE bit to be a reliable indication of whether + * the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r9 + SYNC + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + + tophys(r7,0) + +#ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r6 + beq+ 12f +11: lwz r0,0(r6) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Fetch the linux pte and test and set _PAGE_HASHPTE atomically. + * If _PAGE_HASHPTE was already set, we don't replace the existing + * HPTE, so we just unlock and return. + */ + mr r8,r5 +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif +1: lwarx r6,0,r8 + andi. r0,r6,_PAGE_HASHPTE + bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + ori r5,r6,_PAGE_HASHPTE + stwcx. r5,0,r8 + bne- 1b + + bl create_hpte + +9: +#ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l + eieio + li r0,0 + stw r0,0(r6) /* clear mmu_hash_lock */ +#endif + + /* reenable interrupts and DR */ + mtmsr r9 + SYNC_601 + isync + + lwz r0,4(r1) + mtlr r0 + blr + +/* + * This routine adds a hardware PTE to the hash table. + * It is designed to be called with the MMU either on or off. + * r3 contains the VSID, r4 contains the virtual address, + * r5 contains the linux PTE, r6 contains the old value of the + * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the + * offset to be added to addresses (0 if the MMU is on, + * -KERNELBASE if it is off). r10 contains the upper half of + * the PTE if CONFIG_PTE_64BIT. + * On SMP, the caller should have the mmu_hash_lock held. + * We assume that the caller has (or will) set the _PAGE_HASHPTE + * bit in the linux PTE in memory. The value passed in r6 should + * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set + * this routine will skip the search for an existing HPTE. + * This procedure modifies r0, r3 - r6, r8, cr0. + * -- paulus. + * + * For speed, 4 of the instructions get patched once the size and + * physical address of the hash table are known. These definitions + * of Hash_base and Hash_bits below are just an example. + */ +Hash_base = 0xc0180000 +Hash_bits = 12 /* e.g. 256kB hash table */ +Hash_msk = (((1 << Hash_bits) - 1) * 64) + +/* defines for the PTE format for 32-bit PPCs */ +#define HPTE_SIZE 8 +#define PTEG_SIZE 64 +#define LG_PTEG_SIZE 6 +#define LDPTEu lwzu +#define LDPTE lwz +#define STPTE stw +#define CMPPTE cmpw +#define PTE_H 0x40 +#define PTE_V 0x80000000 +#define TST_V(r) rlwinm. r,r,0,0,0 +#define SET_V(r) oris r,r,PTE_V@h +#define CLR_V(r,t) rlwinm r,r,0,1,31 + +#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) +#define HASH_RIGHT 31-LG_PTEG_SIZE + +_GLOBAL(create_hpte) + /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ + rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + and r8,r8,r0 /* writable if _RW & _DIRTY */ + rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r8,r8,0xe14 /* clear out reserved bits and M */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ +BEGIN_FTR_SECTION + ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */ +END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif + + /* Construct the high word of the PPC-style PTE (r5) */ + rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r5) /* set V (valid) bit */ + + /* Get the address of the primary PTE group in the hash table (r3) */ +_GLOBAL(hash_page_patch_A) + addis r0,r7,Hash_base@h /* base address of hash table */ + rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ + rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r3,r3,r0 /* make primary hash */ + li r0,8 /* PTEs/group */ + + /* + * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search + * if it is clear, meaning that the HPTE isn't there already... + */ + andi. r6,r6,_PAGE_HASHPTE + beq+ 10f /* no PTE: go look for an empty slot */ + tlbie r4 + + addis r4,r7,htab_hash_searches@ha + lwz r6,htab_hash_searches@l(r4) + addi r6,r6,1 /* count how many searches we do */ + stw r6,htab_hash_searches@l(r4) + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + mtctr r0 + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + CMPPTE 0,r6,r5 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_slot + + /* Search the secondary PTEG for a matching PTE */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +_GLOBAL(hash_page_patch_B) + xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + CMPPTE 0,r6,r5 + bdnzf 2,2b + beq+ found_slot + xori r5,r5,PTE_H /* clear H bit again */ + + /* Search the primary PTEG for an empty slot */ +10: mtctr r0 + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + TST_V(r6) /* test valid bit */ + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_empty + + /* update counter of times that the primary PTEG is full */ + addis r4,r7,primary_pteg_full@ha + lwz r6,primary_pteg_full@l(r4) + addi r6,r6,1 + stw r6,primary_pteg_full@l(r4) + + /* Search the secondary PTEG for an empty slot */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +_GLOBAL(hash_page_patch_C) + xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + TST_V(r6) + bdnzf 2,2b + beq+ found_empty + xori r5,r5,PTE_H /* clear H bit again */ + + /* + * Choose an arbitrary slot in the primary PTEG to overwrite. + * Since both the primary and secondary PTEGs are full, and we + * have no information that the PTEs in the primary PTEG are + * more important or useful than those in the secondary PTEG, + * and we know there is a definite (although small) speed + * advantage to putting the PTE in the primary PTEG, we always + * put the PTE in the primary PTEG. + * + * In addition, we skip any slot that is mapping kernel text in + * order to avoid a deadlock when not using BAT mappings if + * trying to hash in the kernel hash code itself after it has + * already taken the hash table lock. This works in conjunction + * with pre-faulting of the kernel text. + * + * If the hash table bucket is full of kernel text entries, we'll + * lockup here but that shouldn't happen + */ + +1: addis r4,r7,next_slot@ha /* get next evict slot */ + lwz r6,next_slot@l(r4) + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. r6,r6,7*HPTE_SIZE + stw r6,next_slot@l(r4) + add r4,r3,r6 + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ + clrrwi r0,r0,12 + lis r6,etext@h + ori r6,r6,etext@l /* get etext */ + tophys(r6,r6) + cmpl cr0,r0,r6 /* compare and try again */ + blt 1b + +#ifndef CONFIG_SMP + /* Store PTE in PTEG */ +found_empty: + STPTE r5,0(r4) +found_slot: + STPTE r8,HPTE_SIZE/2(r4) + +#else /* CONFIG_SMP */ +/* + * Between the tlbie above and updating the hash table entry below, + * another CPU could read the hash table entry and put it in its TLB. + * There are 3 cases: + * 1. using an empty slot + * 2. updating an earlier entry to change permissions (i.e. enable write) + * 3. taking over the PTE for an unrelated address + * + * In each case it doesn't really matter if the other CPUs have the old + * PTE in their TLB. So we don't need to bother with another tlbie here, + * which is convenient as we've overwritten the register that had the + * address. :-) The tlbie above is mainly to make sure that this CPU comes + * and gets the new PTE from the hash table. + * + * We do however have to make sure that the PTE is never in an invalid + * state with the V bit set. + */ +found_empty: +found_slot: + CLR_V(r5,r0) /* clear V (valid) bit in PTE */ + STPTE r5,0(r4) + sync + TLBSYNC + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + sync + SET_V(r5) + STPTE r5,0(r4) /* finally set V bit in PTE */ +#endif /* CONFIG_SMP */ + + sync /* make sure pte updates get to memory */ + blr + + .section .bss + .align 2 +next_slot: + .space 4 +primary_pteg_full: + .space 4 +htab_hash_searches: + .space 4 + .previous + +/* + * Flush the entry for a particular page from the hash table. + * + * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval, + * int count) + * + * We assume that there is a hash table in use (Hash != 0). + */ +_GLOBAL(flush_hash_pages) + tophys(r7,0) + + /* + * We disable interrupts here, even on UP, because we want + * the _PAGE_HASHPTE bit to be a reliable indication of + * whether the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + + /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,22,20,29 +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) + cmpwi cr1,r6,1 + andi. r0,r0,_PAGE_HASHPTE + bne 2f + ble cr1,19f + addi r4,r4,0x1000 + addi r5,r5,PTE_SIZE + addi r6,r6,-1 + b 1b + + /* Convert context and va to VSID */ +2: mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note code below trims to 24 bits */ + + /* Construct the high word of the PPC-style PTE (r11) */ + rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r11) /* set V (valid) bit */ + +#ifdef CONFIG_SMP + addis r9,r7,mmu_hash_lock@ha + addi r9,r9,mmu_hash_lock@l + rlwinm r8,r1,0,0,(31-THREAD_SHIFT) + add r8,r8,r7 + lwz r8,TI_CPU(r8) + oris r8,r8,9 +10: lwarx r0,0,r9 + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r9 + beq+ 12f +11: lwz r0,0(r9) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Check the _PAGE_HASHPTE bit in the linux PTE. If it is + * already clear, we're done (for this pte). If not, + * clear it (atomically) and proceed. -- paulus. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ + andi. r0,r8,_PAGE_HASHPTE + beq 8f /* done if HASHPTE is already clear */ + rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ + stwcx. r8,0,r5 /* update the pte */ + bne- 33b + + /* Get the address of the primary PTE group in the hash table (r3) */ +_GLOBAL(flush_hash_patch_A) + addis r8,r7,Hash_base@h /* base address of hash table */ + rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ + rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r8,r0,r8 /* make primary hash */ + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + li r0,8 /* PTEs/group */ + mtctr r0 + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ + CMPPTE 0,r0,r11 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ 3f + + /* Search the secondary PTEG for a matching PTE */ + ori r11,r11,PTE_H /* set H (secondary hash) bit */ + li r0,8 /* PTEs/group */ +_GLOBAL(flush_hash_patch_B) + xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ + xori r12,r12,(-PTEG_SIZE & 0xffff) + addi r12,r12,-HPTE_SIZE + mtctr r0 +2: LDPTEu r0,HPTE_SIZE(r12) + CMPPTE 0,r0,r11 + bdnzf 2,2b + xori r11,r11,PTE_H /* clear H again */ + bne- 4f /* should rarely fail to find it */ + +3: li r0,0 + STPTE r0,0(r12) /* invalidate entry */ +4: sync + tlbie r4 /* in hw tlb too */ + sync + +8: ble cr1,9f /* if all ptes checked */ +81: addi r6,r6,-1 + addi r5,r5,PTE_SIZE + addi r4,r4,0x1000 + lwz r0,0(r5) /* check next pte */ + cmpwi cr1,r6,1 + andi. r0,r0,_PAGE_HASHPTE + bne 33b + bgt cr1,81b + +9: +#ifdef CONFIG_SMP + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ +#endif + +19: mtmsr r10 + SYNC_601 + isync + blr diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S new file mode 100644 index 0000000..a719f53 --- /dev/null +++ b/arch/powerpc/mm/hash_low_64.S @@ -0,0 +1,958 @@ +/* + * ppc64 MMU hashtable management routines + * + * (c) Copyright IBM Corp. 2003, 2005 + * + * Maintained by: Benjamin Herrenschmidt + * <benh@kernel.crashing.org> + * + * This file is covered by the GNU Public Licence v2 as + * described in the kernel's COPYING file. + */ + +#include <asm/reg.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/types.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> + + .text + +/* + * Stackframe: + * + * +-> Back chain (SP + 256) + * | General register save area (SP + 112) + * | Parameter save area (SP + 48) + * | TOC save area (SP + 40) + * | link editor doubleword (SP + 32) + * | compiler doubleword (SP + 24) + * | LR save area (SP + 16) + * | CR save area (SP + 8) + * SP ---> +-- Back chain (SP + 0) + */ +#define STACKFRAMESIZE 256 + +/* Save parameters offsets */ +#define STK_PARM(i) (STACKFRAMESIZE + 48 + ((i)-3)*8) + +/* Save non-volatile offsets */ +#define STK_REG(i) (112 + ((i)-14)*8) + + +#ifndef CONFIG_PPC_64K_PAGES + +/***************************************************************************** + * * + * 4K SW & 4K HW pages implementation * + * * + *****************************************************************************/ + + +/* + * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, int local, int ssize) + * + * Adds a 4K page to the hash table in a segment of 4K pages only + */ + +_GLOBAL(__hash_page_4K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARM(r6)(r1) + std r8,STK_PARM(r8)(r1) + std r9,STK_PARM(r9)(r1) + + /* Add _PAGE_PRESENT to access */ + ori r4,r4,_PAGE_PRESENT + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is "va" + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + */ + std r27,STK_REG(r27)(r1) + std r28,STK_REG(r28)(r1) + std r29,STK_REG(r29)(r1) + std r30,STK_REG(r30)(r1) + std r31,STK_REG(r31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- htab_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- htab_bail_ok + + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY,HASHPTE and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + +BEGIN_FTR_SECTION + cmpdi r9,0 /* check segment size */ + bne 3f +END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) + /* Calc va and put it in r29 */ + rldicr r29,r5,28,63-28 + rldicl r3,r3,0,36 + or r29,r3,r29 + + /* Calculate hash value for primary slot and store it in r28 */ + rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ + rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ + xor r28,r5,r0 + b 4f + +3: /* Calc VA and hash in r29 and r28 for 1T segment */ + sldi r29,r5,40 /* vsid << 40 */ + clrldi r3,r3,24 /* ea & 0xffffffffff */ + rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ + clrldi r5,r5,40 /* vsid & 0xffffff */ + rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */ + xor r28,r28,r5 + or r29,r3,r29 /* VA */ + xor r28,r28,r0 /* hash */ + + /* Convert linux PTE bits into HW equivalents */ +4: andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl .hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARM(r4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE + */ + andi. r0,r31,_PAGE_HASHPTE + bne htab_modify_pte + +htab_insert_pte: + /* Clear hpte bits in new pte (we also clear BUSY btw) and + * add _PAGE_HASHPTE + */ + lis r0,_PAGE_HPTEFLAGS@h + ori r0,r0,_PAGE_HPTEFLAGS@l + andc r30,r30,r0 + ori r30,r30,_PAGE_HASHPTE + + /* physical address r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_4K /* page size */ + ld r9,STK_PARM(r9)(r1) /* segment size */ +_GLOBAL(htab_call_hpte_insert1) + bl . /* Patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Now try secondary slot */ + + /* physical address r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_4K /* page size */ + ld r9,STK_PARM(r9)(r1) /* segment size */ +_GLOBAL(htab_call_hpte_insert2) + bl . /* Patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +_GLOBAL(htab_call_hpte_remove) + bl . /* Patched by htab_finish_init() */ + + /* Try all again */ + b htab_insert_pte + +htab_bail_ok: + li r3,0 + b htab_bail + +htab_pte_insert_ok: + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 + + /* Write out the PTE with a normal write + * (maybe add eieio may be good still ?) + */ +htab_write_out_pte: + ld r6,STK_PARM(r6)(r1) + std r30,0(r6) + li r3, 0 +htab_bail: + ld r27,STK_REG(r27)(r1) + ld r28,STK_REG(r28)(r1) + ld r29,STK_REG(r29)(r1) + ld r30,STK_REG(r30)(r1) + ld r31,STK_REG(r31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +htab_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + rlwinm r3,r31,32-12,29,31 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r31,_PAGE_SECONDARY + beq 1f + not r5,r5 +1: + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* va */ + li r6,MMU_PAGE_4K /* page size */ + ld r7,STK_PARM(r9)(r1) /* segment size */ + ld r8,STK_PARM(r8)(r1) /* get "local" param */ +_GLOBAL(htab_call_hpte_updatepp) + bl . /* Patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- htab_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + b htab_write_out_pte + +htab_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b htab_bail + +htab_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARM(r6)(r1) + std r31,0(r6) + li r3,-1 + b htab_bail + + +#else /* CONFIG_PPC_64K_PAGES */ + + +/***************************************************************************** + * * + * 64K SW & 4K or 64K HW in a 4K segment pages implementation * + * * + *****************************************************************************/ + +/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, int local, int ssize, + * int subpg_prot) + */ + +/* + * For now, we do NOT implement Admixed pages + */ +_GLOBAL(__hash_page_4K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARM(r6)(r1) + std r8,STK_PARM(r8)(r1) + std r9,STK_PARM(r9)(r1) + + /* Add _PAGE_PRESENT to access */ + ori r4,r4,_PAGE_PRESENT + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is "va" + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + * r26 is the hidx mask + * r25 is the index in combo page + */ + std r25,STK_REG(r25)(r1) + std r26,STK_REG(r26)(r1) + std r27,STK_REG(r27)(r1) + std r28,STK_REG(r28)(r1) + std r29,STK_REG(r29)(r1) + std r30,STK_REG(r30)(r1) + std r31,STK_REG(r31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- htab_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- htab_bail_ok + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED + oris r30,r30,_PAGE_COMBO@h + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + + /* Load the hidx index */ + rldicl r25,r3,64-12,60 + +BEGIN_FTR_SECTION + cmpdi r9,0 /* check segment size */ + bne 3f +END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) + /* Calc va and put it in r29 */ + rldicr r29,r5,28,63-28 /* r29 = (vsid << 28) */ + rldicl r3,r3,0,36 /* r3 = (ea & 0x0fffffff) */ + or r29,r3,r29 /* r29 = va */ + + /* Calculate hash value for primary slot and store it in r28 */ + rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ + rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ + xor r28,r5,r0 + b 4f + +3: /* Calc VA and hash in r29 and r28 for 1T segment */ + sldi r29,r5,40 /* vsid << 40 */ + clrldi r3,r3,24 /* ea & 0xffffffffff */ + rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ + clrldi r5,r5,40 /* vsid & 0xffffff */ + rldicl r0,r3,64-12,36 /* (ea >> 12) & 0xfffffff */ + xor r28,r28,r5 + or r29,r3,r29 /* VA */ + xor r28,r28,r0 /* hash */ + + /* Convert linux PTE bits into HW equivalents */ +4: +#ifdef CONFIG_PPC_SUBPAGE_PROT + andc r10,r30,r10 + andi. r3,r10,0x1fe /* Get basic set of flags */ + rlwinm r0,r10,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ +#else + andi. r3,r30,0x1fe /* Get basic set of flags */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ +#endif + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r3,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl .hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARM(r4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE. We look for + * the bit at (1 >> (index + 32)) + */ + rldicl. r0,r31,64-12,48 + li r26,0 /* Default hidx */ + beq htab_insert_pte + + /* + * Check if the pte was already inserted into the hash table + * as a 64k HW page, and invalidate the 64k HPTE if so. + */ + andis. r0,r31,_PAGE_COMBO@h + beq htab_inval_old_hpte + + ld r6,STK_PARM(r6)(r1) + ori r26,r6,0x8000 /* Load the hidx mask */ + ld r26,0(r26) + addi r5,r25,36 /* Check actual HPTE_SUB bit, this */ + rldcr. r0,r31,r5,0 /* must match pgtable.h definition */ + bne htab_modify_pte + +htab_insert_pte: + /* real page number in r5, PTE RPN value + index */ + andis. r0,r31,_PAGE_4K_PFN@h + srdi r5,r31,PTE_RPN_SHIFT + bne- htab_special_pfn + sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT + add r5,r5,r25 +htab_special_pfn: + sldi r5,r5,HW_PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_4K /* page size */ + ld r9,STK_PARM(r9)(r1) /* segment size */ +_GLOBAL(htab_call_hpte_insert1) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Now try secondary slot */ + + /* real page number in r5, PTE RPN value + index */ + andis. r0,r31,_PAGE_4K_PFN@h + srdi r5,r31,PTE_RPN_SHIFT + bne- 3f + sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT + add r5,r5,r25 +3: sldi r5,r5,HW_PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_4K /* page size */ + ld r9,STK_PARM(r9)(r1) /* segment size */ +_GLOBAL(htab_call_hpte_insert2) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +_GLOBAL(htab_call_hpte_remove) + bl . /* patched by htab_finish_init() */ + + /* Try all again */ + b htab_insert_pte + + /* + * Call out to C code to invalidate an 64k HW HPTE that is + * useless now that the segment has been switched to 4k pages. + */ +htab_inval_old_hpte: + mr r3,r29 /* virtual addr */ + mr r4,r31 /* PTE.pte */ + li r5,0 /* PTE.hidx */ + li r6,MMU_PAGE_64K /* psize */ + ld r7,STK_PARM(r9)(r1) /* ssize */ + ld r8,STK_PARM(r8)(r1) /* local */ + bl .flush_hash_page + /* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */ + lis r0,_PAGE_HPTE_SUB@h + ori r0,r0,_PAGE_HPTE_SUB@l + andc r30,r30,r0 + b htab_insert_pte + +htab_bail_ok: + li r3,0 + b htab_bail + +htab_pte_insert_ok: + /* Insert slot number & secondary bit in PTE second half, + * clear _PAGE_BUSY and set approriate HPTE slot bit + */ + ld r6,STK_PARM(r6)(r1) + li r0,_PAGE_BUSY + andc r30,r30,r0 + /* HPTE SUB bit */ + li r0,1 + subfic r5,r25,27 /* Must match bit position in */ + sld r0,r0,r5 /* pgtable.h */ + or r30,r30,r0 + /* hindx */ + sldi r5,r25,2 + sld r3,r3,r5 + li r4,0xf + sld r4,r4,r5 + andc r26,r26,r4 + or r26,r26,r3 + ori r5,r6,0x8000 + std r26,0(r5) + lwsync + std r30,0(r6) + li r3, 0 +htab_bail: + ld r25,STK_REG(r25)(r1) + ld r26,STK_REG(r26)(r1) + ld r27,STK_REG(r27)(r1) + ld r28,STK_REG(r28)(r1) + ld r29,STK_REG(r29)(r1) + ld r30,STK_REG(r30)(r1) + ld r31,STK_REG(r31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +htab_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + sldi r5,r25,2 + srd r3,r26,r5 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r3,0x8 /* page secondary ? */ + beq 1f + not r5,r5 +1: andi. r3,r3,0x7 /* extract idx alone */ + + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* va */ + li r6,MMU_PAGE_4K /* page size */ + ld r7,STK_PARM(r9)(r1) /* segment size */ + ld r8,STK_PARM(r8)(r1) /* get "local" param */ +_GLOBAL(htab_call_hpte_updatepp) + bl . /* patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- htab_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + ld r6,STK_PARM(r6)(r1) + std r30,0(r6) + li r3,0 + b htab_bail + +htab_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b htab_bail + +htab_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARM(r6)(r1) + std r31,0(r6) + li r3,-1 + b htab_bail + +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_HAS_HASH_64K + +/***************************************************************************** + * * + * 64K SW & 64K HW in a 64K segment pages implementation * + * * + *****************************************************************************/ + +_GLOBAL(__hash_page_64K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARM(r6)(r1) + std r8,STK_PARM(r8)(r1) + std r9,STK_PARM(r9)(r1) + + /* Add _PAGE_PRESENT to access */ + ori r4,r4,_PAGE_PRESENT + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is "va" + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + */ + std r27,STK_REG(r27)(r1) + std r28,STK_REG(r28)(r1) + std r29,STK_REG(r29)(r1) + std r30,STK_REG(r30)(r1) + std r31,STK_REG(r31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- ht64_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- ht64_bail_ok +BEGIN_FTR_SECTION + /* Check if PTE has the cache-inhibit bit set */ + andi. r0,r31,_PAGE_NO_CACHE + /* If so, bail out and refault as a 4k page */ + bne- ht64_bail_ok +END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + +BEGIN_FTR_SECTION + cmpdi r9,0 /* check segment size */ + bne 3f +END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) + /* Calc va and put it in r29 */ + rldicr r29,r5,28,63-28 + rldicl r3,r3,0,36 + or r29,r3,r29 + + /* Calculate hash value for primary slot and store it in r28 */ + rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ + rldicl r0,r3,64-16,52 /* (ea >> 16) & 0xfff */ + xor r28,r5,r0 + b 4f + +3: /* Calc VA and hash in r29 and r28 for 1T segment */ + sldi r29,r5,40 /* vsid << 40 */ + clrldi r3,r3,24 /* ea & 0xffffffffff */ + rldic r28,r5,25,25 /* (vsid << 25) & 0x7fffffffff */ + clrldi r5,r5,40 /* vsid & 0xffffff */ + rldicl r0,r3,64-16,40 /* (ea >> 16) & 0xffffff */ + xor r28,r28,r5 + or r29,r3,r29 /* VA */ + xor r28,r28,r0 /* hash */ + + /* Convert linux PTE bits into HW equivalents */ +4: andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + ori r3,r3,HPTE_R_C /* Always add "C" bit for perf. */ + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl .hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARM(r4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE + */ + rldicl. r0,r31,64-12,48 + bne ht64_modify_pte + +ht64_insert_pte: + /* Clear hpte bits in new pte (we also clear BUSY btw) and + * add _PAGE_HPTE_SUB0 + */ + lis r0,_PAGE_HPTEFLAGS@h + ori r0,r0,_PAGE_HPTEFLAGS@l + andc r30,r30,r0 +#ifdef CONFIG_PPC_64K_PAGES + oris r30,r30,_PAGE_HPTE_SUB0@h +#else + ori r30,r30,_PAGE_HASHPTE +#endif + /* Phyical address in r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_64K + ld r9,STK_PARM(r9)(r1) /* segment size */ +_GLOBAL(ht64_call_hpte_insert1) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge ht64_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- ht64_pte_insert_failure + + /* Now try secondary slot */ + + /* Phyical address in r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_64K + ld r9,STK_PARM(r9)(r1) /* segment size */ +_GLOBAL(ht64_call_hpte_insert2) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ ht64_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- ht64_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +_GLOBAL(ht64_call_hpte_remove) + bl . /* patched by htab_finish_init() */ + + /* Try all again */ + b ht64_insert_pte + +ht64_bail_ok: + li r3,0 + b ht64_bail + +ht64_pte_insert_ok: + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 + + /* Write out the PTE with a normal write + * (maybe add eieio may be good still ?) + */ +ht64_write_out_pte: + ld r6,STK_PARM(r6)(r1) + std r30,0(r6) + li r3, 0 +ht64_bail: + ld r27,STK_REG(r27)(r1) + ld r28,STK_REG(r28)(r1) + ld r29,STK_REG(r29)(r1) + ld r30,STK_REG(r30)(r1) + ld r31,STK_REG(r31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +ht64_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + rlwinm r3,r31,32-12,29,31 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r31,_PAGE_F_SECOND + beq 1f + not r5,r5 +1: + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* va */ + li r6,MMU_PAGE_64K + ld r7,STK_PARM(r9)(r1) /* segment size */ + ld r8,STK_PARM(r8)(r1) /* get "local" param */ +_GLOBAL(ht64_call_hpte_updatepp) + bl . /* patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- ht64_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + b ht64_write_out_pte + +ht64_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b ht64_bail + +ht64_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARM(r6)(r1) + std r31,0(r6) + li r3,-1 + b ht64_bail + + +#endif /* CONFIG_PPC_HAS_HASH_64K */ + + +/***************************************************************************** + * * + * Huge pages implementation is in hugetlbpage.c * + * * + *****************************************************************************/ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c new file mode 100644 index 0000000..34e5c0b --- /dev/null +++ b/arch/powerpc/mm/hash_native_64.c @@ -0,0 +1,569 @@ +/* + * native hashtable management. + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG_LOW + +#include <linux/spinlock.h> +#include <linux/bitops.h> +#include <linux/threads.h> +#include <linux/smp.h> + +#include <asm/abs_addr.h> +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/cputable.h> +#include <asm/udbg.h> +#include <asm/kexec.h> + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define HPTE_LOCK_BIT 3 + +static DEFINE_SPINLOCK(native_tlbie_lock); + +static inline void __tlbie(unsigned long va, int psize, int ssize) +{ + unsigned int penc; + + /* clear top 16 bits, non SLS segment */ + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + va &= ~0xffful; + va |= ssize << 8; + asm volatile("tlbie %0,0" : : "r" (va) : "memory"); + break; + default: + penc = mmu_psize_defs[psize].penc; + va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + asm volatile("tlbie %0,1" : : "r" (va) : "memory"); + break; + } +} + +static inline void __tlbiel(unsigned long va, int psize, int ssize) +{ + unsigned int penc; + + /* clear top 16 bits, non SLS segment */ + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + va &= ~0xffful; + va |= ssize << 8; + asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" + : : "r"(va) : "memory"); + break; + default: + penc = mmu_psize_defs[psize].penc; + va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)" + : : "r"(va) : "memory"); + break; + } + +} + +static inline void tlbie(unsigned long va, int psize, int ssize, int local) +{ + unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL); + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + if (use_local) + use_local = mmu_psize_defs[psize].tlbiel; + if (lock_tlbie && !use_local) + spin_lock(&native_tlbie_lock); + asm volatile("ptesync": : :"memory"); + if (use_local) { + __tlbiel(va, psize, ssize); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie(va, psize, ssize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + if (lock_tlbie && !use_local) + spin_unlock(&native_tlbie_lock); +} + +static inline void native_lock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = &hptep->v; + + while (1) { + if (!test_and_set_bit(HPTE_LOCK_BIT, word)) + break; + while(test_bit(HPTE_LOCK_BIT, word)) + cpu_relax(); + } +} + +static inline void native_unlock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = &hptep->v; + + asm volatile("lwsync":::"memory"); + clear_bit(HPTE_LOCK_BIT, word); +} + +static long native_hpte_insert(unsigned long hpte_group, unsigned long va, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int ssize) +{ + struct hash_pte *hptep = htab_address + hpte_group; + unsigned long hpte_v, hpte_r; + int i; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" insert(group=%lx, va=%016lx, pa=%016lx," + " rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, va, pa, rflags, vflags, psize); + } + + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (! (hptep->v & HPTE_V_VALID)) { + /* retry with lock held */ + native_lock_hpte(hptep); + if (! (hptep->v & HPTE_V_VALID)) + break; + native_unlock_hpte(hptep); + } + + hptep++; + } + + if (i == HPTES_PER_GROUP) + return -1; + + hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", + i, hpte_v, hpte_r); + } + + hptep->r = hpte_r; + /* Guarantee the second dword is visible before the valid bit */ + eieio(); + /* + * Now set the first dword including the valid bit + * NOTE: this also unlocks the hpte + */ + hptep->v = hpte_v; + + __asm__ __volatile__ ("ptesync" : : : "memory"); + + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static long native_hpte_remove(unsigned long hpte_group) +{ + struct hash_pte *hptep; + int i; + int slot_offset; + unsigned long hpte_v; + + DBG_LOW(" remove(group=%lx)\n", hpte_group); + + /* pick a random entry to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group + slot_offset; + hpte_v = hptep->v; + + if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { + /* retry with lock held */ + native_lock_hpte(hptep); + hpte_v = hptep->v; + if ((hpte_v & HPTE_V_VALID) + && !(hpte_v & HPTE_V_BOLTED)) + break; + native_unlock_hpte(hptep); + } + + slot_offset++; + slot_offset &= 0x7; + } + + if (i == HPTES_PER_GROUP) + return -1; + + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + return i; +} + +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long va, int psize, int ssize, + int local) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v, want_v; + int ret = 0; + + want_v = hpte_encode_v(va, psize, ssize); + + DBG_LOW(" update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)", + va, want_v & HPTE_V_AVPN, slot, newpp); + + native_lock_hpte(hptep); + + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { + DBG_LOW(" -> miss\n"); + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); + } + native_unlock_hpte(hptep); + + /* Ensure it is out of the tlb too. */ + tlbie(va, psize, ssize, local); + + return ret; +} + +static long native_hpte_find(unsigned long va, int psize, int ssize) +{ + struct hash_pte *hptep; + unsigned long hash; + unsigned long i; + long slot; + unsigned long want_v, hpte_v; + + hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_v(va, psize, ssize); + + /* Bolted mappings are only ever in the primary group */ + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + slot; + hpte_v = hptep->v; + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* HPTE matches */ + return slot; + ++slot; + } + + return -1; +} + +/* + * Update the page protection bits. Intended to be used to create + * guard pages for kernel data structures on pages which are bolted + * in the HPT. Assumes pages being operated on will not be stolen. + * + * No need to lock here because we should be the only user. + */ +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize, int ssize) +{ + unsigned long vsid, va; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + va = hpt_va(ea, vsid, ssize); + + slot = native_hpte_find(va, psize, ssize); + if (slot == -1) + panic("could not find page to bolt\n"); + hptep = htab_address + slot; + + /* Update the HPTE */ + hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N)); + + /* Ensure it is out of the tlb too. */ + tlbie(va, psize, ssize, 0); +} + +static void native_hpte_invalidate(unsigned long slot, unsigned long va, + int psize, int ssize, int local) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + + local_irq_save(flags); + + DBG_LOW(" invalidate(va=%016lx, hash: %x)\n", va, slot); + + want_v = hpte_encode_v(va, psize, ssize); + native_lock_hpte(hptep); + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(va, psize, ssize, local); + + local_irq_restore(flags); +} + +#define LP_SHIFT 12 +#define LP_BITS 8 +#define LP_MASK(i) ((0xFF >> (i)) << LP_SHIFT) + +static void hpte_decode(struct hash_pte *hpte, unsigned long slot, + int *psize, int *ssize, unsigned long *va) +{ + unsigned long hpte_r = hpte->r; + unsigned long hpte_v = hpte->v; + unsigned long avpn; + int i, size, shift, penc; + + if (!(hpte_v & HPTE_V_LARGE)) + size = MMU_PAGE_4K; + else { + for (i = 0; i < LP_BITS; i++) { + if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1)) + break; + } + penc = LP_MASK(i+1) >> LP_SHIFT; + for (size = 0; size < MMU_PAGE_COUNT; size++) { + + /* 4K pages are not represented by LP */ + if (size == MMU_PAGE_4K) + continue; + + /* valid entries have a shift value */ + if (!mmu_psize_defs[size].shift) + continue; + + if (penc == mmu_psize_defs[size].penc) + break; + } + } + + /* This works for all page sizes, and for 256M and 1T segments */ + shift = mmu_psize_defs[size].shift; + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23; + + if (shift < 23) { + unsigned long vpi, vsid, pteg; + + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + switch (hpte_v >> HPTE_V_SSIZE_SHIFT) { + case MMU_SEGSIZE_256M: + vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask; + break; + case MMU_SEGSIZE_1T: + vsid = avpn >> 40; + vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; + break; + default: + avpn = vpi = size = 0; + } + avpn |= (vpi << mmu_psize_defs[size].shift); + } + + *va = avpn; + *psize = size; + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; +} + +/* + * clear all mappings on kexec. All cpus are in real mode (or they will + * be when they isi), and we are the only one left. We rely on our kernel + * mapping being 0xC0's and the hardware ignoring those two real bits. + * + * TODO: add batching support when enabled. remember, no dynamic memory here, + * athough there is the control page available... + */ +static void native_hpte_clear(void) +{ + unsigned long slot, slots, flags; + struct hash_pte *hptep = htab_address; + unsigned long hpte_v, va; + unsigned long pteg_count; + int psize, ssize; + + pteg_count = htab_hash_mask + 1; + + local_irq_save(flags); + + /* we take the tlbie lock and hold it. Some hardware will + * deadlock if we try to tlbie from two processors at once. + */ + spin_lock(&native_tlbie_lock); + + slots = pteg_count * HPTES_PER_GROUP; + + for (slot = 0; slot < slots; slot++, hptep++) { + /* + * we could lock the pte here, but we are the only cpu + * running, right? and for crash dump, we probably + * don't want to wait for a maybe bad cpu. + */ + hpte_v = hptep->v; + + /* + * Call __tlbie() here rather than tlbie() since we + * already hold the native_tlbie_lock. + */ + if (hpte_v & HPTE_V_VALID) { + hpte_decode(hptep, slot, &psize, &ssize, &va); + hptep->v = 0; + __tlbie(va, psize, ssize); + } + } + + asm volatile("eieio; tlbsync; ptesync":::"memory"); + spin_unlock(&native_tlbie_lock); + local_irq_restore(flags); +} + +/* + * Batched hash table flush, we batch the tlbie's to avoid taking/releasing + * the lock all the time + */ +static void native_flush_hash_range(unsigned long number, int local) +{ + unsigned long va, hash, index, hidx, shift, slot; + struct hash_pte *hptep; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + real_pte_t pte; + struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + unsigned long psize = batch->psize; + int ssize = batch->ssize; + int i; + + local_irq_save(flags); + + for (i = 0; i < number; i++) { + va = batch->vaddr[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, va, index, shift) { + hash = hpt_hash(va, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + hptep = htab_address + slot; + want_v = hpte_encode_v(va, psize, ssize); + native_lock_hpte(hptep); + hpte_v = hptep->v; + if (!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + hptep->v = 0; + } pte_iterate_hashed_end(); + } + + if (cpu_has_feature(CPU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && local) { + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + va = batch->vaddr[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, va, index, + shift) { + __tlbiel(va, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("ptesync":::"memory"); + } else { + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + va = batch->vaddr[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, va, index, + shift) { + __tlbie(va, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + spin_unlock(&native_tlbie_lock); + } + + local_irq_restore(flags); +} + +#ifdef CONFIG_PPC_PSERIES +/* Disable TLB batching on nighthawk */ +static inline int tlb_batching_enabled(void) +{ + struct device_node *root = of_find_node_by_path("/"); + int enabled = 1; + + if (root) { + const char *model = of_get_property(root, "model", NULL); + if (model && !strcmp(model, "IBM,9076-N81")) + enabled = 0; + of_node_put(root); + } + + return enabled; +} +#else +static inline int tlb_batching_enabled(void) +{ + return 1; +} +#endif + +void __init hpte_init_native(void) +{ + ppc_md.hpte_invalidate = native_hpte_invalidate; + ppc_md.hpte_updatepp = native_hpte_updatepp; + ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; + ppc_md.hpte_insert = native_hpte_insert; + ppc_md.hpte_remove = native_hpte_remove; + ppc_md.hpte_clear_all = native_hpte_clear; + if (tlb_batching_enabled()) + ppc_md.flush_hash_range = native_flush_hash_range; +} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c new file mode 100644 index 0000000..8d5b475 --- /dev/null +++ b/arch/powerpc/mm/hash_utils_64.c @@ -0,0 +1,1197 @@ +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * + * Module name: htab.c + * + * Description: + * PowerPC Hashed Page Table functions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG +#undef DEBUG_LOW + +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/sysctl.h> +#include <linux/ctype.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/lmb.h> + +#include <asm/processor.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/page.h> +#include <asm/types.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/machdep.h> +#include <asm/prom.h> +#include <asm/abs_addr.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/eeh.h> +#include <asm/tlb.h> +#include <asm/cacheflush.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/spu.h> +#include <asm/udbg.h> + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024L*MB) + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. + * + */ + +#ifdef CONFIG_U3_DART +extern unsigned long dart_tablebase; +#endif /* CONFIG_U3_DART */ + +static unsigned long _SDR1; +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +struct hash_pte *htab_address; +unsigned long htab_size_bytes; +unsigned long htab_hash_mask; +int mmu_linear_psize = MMU_PAGE_4K; +int mmu_virtual_psize = MMU_PAGE_4K; +int mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif +int mmu_io_psize = MMU_PAGE_4K; +int mmu_kernel_ssize = MMU_SEGSIZE_256M; +int mmu_highuser_ssize = MMU_SEGSIZE_256M; +u16 mmu_slb_size = 64; +#ifdef CONFIG_HUGETLB_PAGE +unsigned int HPAGE_SHIFT; +#endif +#ifdef CONFIG_PPC_64K_PAGES +int mmu_ci_restrictions; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_SPINLOCK(linear_map_hash_lock); +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +/* There are definitions of page sizes arrays to be used when none + * is provided by the firmware. + */ + +/* Pre-POWER4 CPUs (4k pages only) + */ +static struct mmu_psize_def mmu_psize_defaults_old[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = 0, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +static struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = 0, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = 0, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + +static unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = pteflags & 0x1fa; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + + /* PP bits. PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) && + (pteflags & _PAGE_DIRTY))) + rflags |= 1; + + /* Always add C */ + return rflags | HPTE_R_C; +} + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long prot, + int psize, int ssize) +{ + unsigned long vaddr, paddr; + unsigned int step, shift; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + + for (vaddr = vstart, paddr = pstart; vaddr < vend; + vaddr += step, paddr += step) { + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, ssize); + unsigned long va = hpt_va(vaddr, vsid, ssize); + unsigned long tprot = prot; + + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + hash = hpt_hash(va, shift, ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + BUG_ON(!ppc_md.hpte_insert); + ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot, + HPTE_V_BOLTED, psize, ssize); + + if (ret < 0) + break; +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + } + return ret < 0 ? ret : 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr; + unsigned int step, shift; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!ppc_md.hpte_removebolted) { + printk(KERN_WARNING "Platform doesn't implement " + "hpte_removebolted\n"); + return -EINVAL; + } + + for (vaddr = vstart; vaddr < vend; vaddr += step) + ppc_md.hpte_removebolted(vaddr, psize, ssize); + + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static int __init htab_dt_scan_seg_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + u32 *prop; + unsigned long size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", + &size); + if (prop == NULL) + return 0; + for (; size >= 4; size -= 4, ++prop) { + if (prop[0] == 40) { + DBG("1T segment support detected\n"); + cur_cpu_spec->cpu_features |= CPU_FTR_1T_SEGMENT; + return 1; + } + } + cur_cpu_spec->cpu_features &= ~CPU_FTR_NO_SLBIE_B; + return 0; +} + +static void __init htab_init_seg_sizes(void) +{ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); +} + +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + u32 *prop; + unsigned long size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = (u32 *)of_get_flat_dt_prop(node, + "ibm,segment-page-sizes", &size); + if (prop != NULL) { + DBG("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE); + while(size > 0) { + unsigned int shift = prop[0]; + unsigned int slbenc = prop[1]; + unsigned int lpnum = prop[2]; + unsigned int lpenc = 0; + struct mmu_psize_def *def; + int idx = -1; + + size -= 3; prop += 3; + while(size > 0 && lpnum) { + if (prop[0] == shift) + lpenc = prop[1]; + prop += 2; size -= 2; + lpnum--; + } + switch(shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + if (idx < 0) + continue; + def = &mmu_psize_defs[idx]; + def->shift = shift; + if (shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (shift - 23)) - 1; + def->sllp = slbenc; + def->penc = lpenc; + /* We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, " + "tlbiel=%d, penc=%d\n", + idx, shift, def->sllp, def->avpnm, def->tlbiel, + def->penc); + } + return 1; + } + return 0; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + unsigned long *addr_prop; + u32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* This property is the log base 2 of the number of virtual pages that + * will represent this memory block. */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << page_count_prop[0]); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = addr_prop[0]; + block_size = addr_prop[1]; + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + (16 * GB) <= lmb_end_of_DRAM()) { + lmb_reserve(phys_addr, block_size * expected_pages); + add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void __init htab_init_page_sizes(void) +{ + int rc; + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults_old, + sizeof(mmu_psize_defaults_old)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + + /* + * Not in the device-tree, let's fallback on known size + * list for 16M capable GP & GR + */ + if (cpu_has_feature(CPU_FTR_16M_PAGE)) + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + found: +#ifndef CONFIG_DEBUG_PAGEALLOC + /* + * Pick a size for the linear mapping. Currently, we only support + * 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K for user mappings and vmalloc if supported by the processor. + * We only use 64k for ioremap if the processor + * (and firmware) support cache-inhibited large pages. + * If not, we use 4k and set mmu_ci_restrictions so that + * hash_page knows to switch processes that use cache-inhibited + * mappings to 4k pages. + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift) { + mmu_virtual_psize = MMU_PAGE_64K; + mmu_vmalloc_psize = MMU_PAGE_64K; + if (mmu_linear_psize == MMU_PAGE_4K) + mmu_linear_psize = MMU_PAGE_64K; + if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) { + /* + * Don't use 64k pages for ioremap on pSeries, since + * that would stop us accessing the HEA ethernet. + */ + if (!machine_is(pseries)) + mmu_io_psize = MMU_PAGE_64K; + } else + mmu_ci_restrictions = 1; + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + lmb_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_64K].shift) + mmu_vmemmap_psize = MMU_PAGE_64K; + else + mmu_vmemmap_psize = MMU_PAGE_4K; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + printk(KERN_DEBUG "Page orders: linear mapping = %d, " + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift, + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); + +#ifdef CONFIG_HUGETLB_PAGE + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); + +/* Set default large page size. Currently, we pick 16M or 1M depending + * on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; + /* With 4k/4level pagetables, we can't (for now) cope with a + * huge page size < PMD_SIZE */ + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; +#endif /* CONFIG_HUGETLB_PAGE */ +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + u32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = prop[1]; + return 1; + } + return 0; +} + +static unsigned long __init htab_get_table_size(void) +{ + unsigned long mem_size, rnd_mem_size, pteg_count; + + /* If hash size isn't already provided by the platform, we try to + * retrieve it from the device-tree. If it's not there neither, we + * calculate it now based on the total RAM size + */ + if (ppc64_pft_size == 0) + of_scan_flat_dt(htab_dt_scan_pftsize, NULL); + if (ppc64_pft_size) + return 1UL << ppc64_pft_size; + + /* round mem_size up to next power of 2 */ + mem_size = lmb_phys_mem_size(); + rnd_mem_size = 1UL << __ilog2(mem_size); + if (rnd_mem_size < mem_size) + rnd_mem_size <<= 1; + + /* # pages / 2 */ + pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11); + + return pteg_count << 7; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void create_section_mapping(unsigned long start, unsigned long end) +{ + BUG_ON(htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize)); +} + +int remove_section_mapping(unsigned long start, unsigned long end) +{ + return htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static inline void make_bl(unsigned int *insn_addr, void *func) +{ + unsigned long funcp = *((unsigned long *)func); + int offset = funcp - (unsigned long)insn_addr; + + *insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc)); + flush_icache_range((unsigned long)insn_addr, 4+ + (unsigned long)insn_addr); +} + +static void __init htab_finish_init(void) +{ + extern unsigned int *htab_call_hpte_insert1; + extern unsigned int *htab_call_hpte_insert2; + extern unsigned int *htab_call_hpte_remove; + extern unsigned int *htab_call_hpte_updatepp; + +#ifdef CONFIG_PPC_HAS_HASH_64K + extern unsigned int *ht64_call_hpte_insert1; + extern unsigned int *ht64_call_hpte_insert2; + extern unsigned int *ht64_call_hpte_remove; + extern unsigned int *ht64_call_hpte_updatepp; + + make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert); + make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert); + make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove); + make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp); +#endif /* CONFIG_PPC_HAS_HASH_64K */ + + make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert); + make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert); + make_bl(htab_call_hpte_remove, ppc_md.hpte_remove); + make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp); +} + +void __init htab_initialize(void) +{ + unsigned long table; + unsigned long pteg_count; + unsigned long prot; + unsigned long base = 0, size = 0, limit; + int i; + + DBG(" -> htab_initialize()\n"); + + /* Initialize segment sizes */ + htab_init_seg_sizes(); + + /* Initialize page sizes */ + htab_init_page_sizes(); + + if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) { + mmu_kernel_ssize = MMU_SEGSIZE_1T; + mmu_highuser_ssize = MMU_SEGSIZE_1T; + printk(KERN_INFO "Using 1TB segments\n"); + } + + /* + * Calculate the required size of the htab. We want the number of + * PTEGs to equal one half the number of real pages. + */ + htab_size_bytes = htab_get_table_size(); + pteg_count = htab_size_bytes >> 7; + + htab_hash_mask = pteg_count - 1; + + if (firmware_has_feature(FW_FEATURE_LPAR)) { + /* Using a hypervisor which owns the htab */ + htab_address = NULL; + _SDR1 = 0; + } else { + /* Find storage for the HPT. Must be contiguous in + * the absolute address space. On cell we want it to be + * in the first 2 Gig so we can use it for IOMMU hacks. + */ + if (machine_is(cell)) + limit = 0x80000000; + else + limit = 0; + + table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + htab_address = abs_to_virt(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(pteg_count) - 11; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + } + + prot = pgprot_val(PAGE_KERNEL); + +#ifdef CONFIG_DEBUG_PAGEALLOC + linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = __va(lmb_alloc_base(linear_map_hash_count, + 1, lmb.rmo_size)); + memset(linear_map_hash_slots, 0, linear_map_hash_count); +#endif /* CONFIG_DEBUG_PAGEALLOC */ + + /* On U3 based machines, we need to reserve the DART area and + * _NOT_ map it to avoid cache paradoxes as it's remapped non + * cacheable later on + */ + + /* create bolted the linear mapping in the hash table */ + for (i=0; i < lmb.memory.cnt; i++) { + base = (unsigned long)__va(lmb.memory.region[i].base); + size = lmb.memory.region[i].size; + + DBG("creating mapping for region: %lx..%lx (prot: %x)\n", + base, size, prot); + +#ifdef CONFIG_U3_DART + /* Do not map the DART space. Fortunately, it will be aligned + * in such a way that it will not cross two lmb regions and + * will fit within a single 16Mb page. + * The DART space is assumed to be a full 16Mb region even if + * we only use 2Mb of that space. We will use more of it later + * for AGP GART. We have to use a full 16Mb large page. + */ + DBG("DART base: %lx\n", dart_tablebase); + + if (dart_tablebase != 0 && dart_tablebase >= base + && dart_tablebase < (base + size)) { + unsigned long dart_table_end = dart_tablebase + 16 * MB; + if (base != dart_tablebase) + BUG_ON(htab_bolt_mapping(base, dart_tablebase, + __pa(base), prot, + mmu_linear_psize, + mmu_kernel_ssize)); + if ((base + size) > dart_table_end) + BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, + base + size, + __pa(dart_table_end), + prot, + mmu_linear_psize, + mmu_kernel_ssize)); + continue; + } +#endif /* CONFIG_U3_DART */ + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), + prot, mmu_linear_psize, mmu_kernel_ssize)); + } + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + __pa(tce_alloc_start), prot, + mmu_linear_psize, mmu_kernel_ssize)); + } + + htab_finish_init(); + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +void htab_initialize_secondary(void) +{ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + mtspr(SPRN_SDR1, _SDR1); +} + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct page *page; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + __flush_dcache_icache(page_address(page)); + set_bit(PG_arch_1, &page->flags); + } else + pp |= HPTE_R_N; + } + return pp; +} + +#ifdef CONFIG_PPC_MM_SLICES +unsigned int get_paca_psize(unsigned long addr) +{ + unsigned long index, slices; + + if (addr < SLICE_LOW_TOP) { + slices = get_paca()->context.low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + } else { + slices = get_paca()->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + } + return (slices >> (index * 4)) & 0xF; +} + +#else +unsigned int get_paca_psize(unsigned long addr) +{ + return get_paca()->context.user_psize; +} +#endif + +/* + * Demote a segment to using 4k pages. + * For now this makes the whole process use 4k pages. + */ +#ifdef CONFIG_PPC_64K_PAGES +void demote_segment_4k(struct mm_struct *mm, unsigned long addr) +{ + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + return; + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif + if (get_paca_psize(addr) != MMU_PAGE_4K) { + get_paca()->context = mm->context; + slb_flush_and_rebolt(); + } +} +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_SUBPAGE_PROT +/* + * This looks up a 2-bit protection code for a 4k subpage of a 64k page. + * Userspace sets the subpage permissions using the subpage_prot system call. + * + * Result is 0: full permissions, _PAGE_RW: read-only, + * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. + */ +static int subpage_protection(pgd_t *pgdir, unsigned long ea) +{ + struct subpage_prot_table *spt = pgd_subpage_prot(pgdir); + u32 spp = 0; + u32 **sbpm, *sbpp; + + if (ea >= spt->maxaddr) + return 0; + if (ea < 0x100000000) { + /* addresses below 4GB use spt->low_prot */ + sbpm = spt->low_prot; + } else { + sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; + if (!sbpm) + return 0; + } + sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!sbpp) + return 0; + spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; + + /* extract 2-bit bitfield for this 4k subpage */ + spp >>= 30 - 2 * ((ea >> 12) & 0xf); + + /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ + spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); + return spp; +} + +#else /* CONFIG_PPC_SUBPAGE_PROT */ +static inline int subpage_protection(pgd_t *pgdir, unsigned long ea) +{ + return 0; +} +#endif + +/* Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + * -2 - access not permitted by subpage protection mechanism + */ +int hash_page(unsigned long ea, unsigned long access, unsigned long trap) +{ + void *pgdir; + unsigned long vsid; + struct mm_struct *mm; + pte_t *ptep; + cpumask_t tmp; + int rc, user_region = 0, local = 0; + int psize, ssize; + + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) { + DBG_LOW(" out of pgtable range !\n"); + return 1; + } + + /* Get region & vsid */ + switch (REGION_ID(ea)) { + case USER_REGION_ID: + user_region = 1; + mm = current->mm; + if (! mm) { + DBG_LOW(" user region with no mm !\n"); + return 1; + } + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + break; + case VMALLOC_REGION_ID: + mm = &init_mm; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + if (ea < VMALLOC_END) + psize = mmu_vmalloc_psize; + else + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + break; + default: + /* Not a valid range + * Send the problem up to do_page_fault + */ + return 1; + } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + + /* Get pgdir */ + pgdir = mm->pgd; + if (pgdir == NULL) + return 1; + + /* Check CPU locality */ + tmp = cpumask_of_cpu(smp_processor_id()); + if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) + local = 1; + +#ifdef CONFIG_HUGETLB_PAGE + /* Handle hugepage regions */ + if (HPAGE_SHIFT && mmu_huge_psizes[psize]) { + DBG_LOW(" -> huge page !\n"); + return hash_huge_page(mm, access, ea, vsid, local, trap); + } +#endif /* CONFIG_HUGETLB_PAGE */ + +#ifndef CONFIG_PPC_64K_PAGES + /* If we use 4K pages and our psize is not 4K, then we are hitting + * a special driver mapping, we need to align the address before + * we fetch the PTE + */ + if (psize != MMU_PAGE_4K) + ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get PTE and page size from page tables */ + ptep = find_linux_pte(pgdir, ea); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + return 1; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (access & ~pte_val(*ptep)) { + DBG_LOW(" no access !\n"); + return 1; + } + + /* Do actual hashing */ +#ifdef CONFIG_PPC_64K_PAGES + /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } + + /* If this PTE is non-cacheable and we have restrictions on + * using non cacheable large pages, then we switch to 4k + */ + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && + (pte_val(*ptep) & _PAGE_NO_CACHE)) { + if (user_region) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } else if (ea < VMALLOC_END) { + /* + * some driver did a non-cacheable mapping + * in vmalloc space, so switch vmalloc + * to 4k pages + */ + printk(KERN_ALERT "Reducing vmalloc segment " + "to 4kB pages because of " + "non-cacheable mapping\n"); + psize = mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif + } + } + if (user_region) { + if (psize != get_paca_psize(ea)) { + get_paca()->context = mm->context; + slb_flush_and_rebolt(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_HAS_HASH_64K + if (psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); + else +#endif /* CONFIG_PPC_HAS_HASH_64K */ + { + int spp = subpage_protection(pgdir, ea); + if (access & spp) + rc = -2; + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, + local, ssize, spp); + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + return rc; +} +EXPORT_SYMBOL_GPL(hash_page); + +void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap) +{ + unsigned long vsid; + void *pgdir; + pte_t *ptep; + cpumask_t mask; + unsigned long flags; + int local = 0; + int ssize; + + BUG_ON(REGION_ID(ea) != USER_REGION_ID); + +#ifdef CONFIG_PPC_MM_SLICES + /* We only prefault standard pages for now */ + if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + return; +#endif + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get Linux PTE if available */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + ptep = find_linux_pte(pgdir, ea); + if (!ptep) + return; + +#ifdef CONFIG_PPC_64K_PAGES + /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on + * a 64K kernel), then we don't preload, hash_page() will take + * care of it once we actually try to access the page. + * That way we don't have to duplicate all of the logic for segment + * page size demotion here + */ + if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) + return; +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + + /* Hash doesn't like irqs */ + local_irq_save(flags); + + /* Is that local to this CPU ? */ + mask = cpumask_of_cpu(smp_processor_id()); + if (cpus_equal(mm->cpu_vm_mask, mask)) + local = 1; + + /* Hash it in */ +#ifdef CONFIG_PPC_HAS_HASH_64K + if (mm->context.user_psize == MMU_PAGE_64K) + __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); + else +#endif /* CONFIG_PPC_HAS_HASH_64K */ + __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, + subpage_protection(pgdir, ea)); + + local_irq_restore(flags); +} + +/* WARNING: This is called from hash_low_64.S, if you change this prototype, + * do not forget to update the assembly call site ! + */ +void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize, + int local) +{ + unsigned long hash, index, shift, hidx, slot; + + DBG_LOW("flush_hash_page(va=%016x)\n", va); + pte_iterate_hashed_subpages(pte, psize, va, index, shift) { + hash = hpt_hash(va, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx); + ppc_md.hpte_invalidate(slot, va, psize, ssize, local); + } pte_iterate_hashed_end(); +} + +void flush_hash_range(unsigned long number, int local) +{ + if (ppc_md.flush_hash_range) + ppc_md.flush_hash_range(number, local); + else { + int i; + struct ppc64_tlb_batch *batch = + &__get_cpu_var(ppc64_tlb_batch); + + for (i = 0; i < number; i++) + flush_hash_page(batch->vaddr[i], batch->pte[i], + batch->psize, batch->ssize, local); + } +} + +/* + * low_hash_fault is called when we the low level hash code failed + * to instert a PTE due to an hypervisor error + */ +void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) +{ + if (user_mode(regs)) { +#ifdef CONFIG_PPC_SUBPAGE_PROT + if (rc == -2) + _exception(SIGSEGV, regs, SEGV_ACCERR, address); + else +#endif + _exception(SIGBUS, regs, BUS_ADRERR, address); + } else + bad_page_fault(regs, address, SIGBUS); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); + int ret; + + hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr), + mode, HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + BUG_ON (ret < 0); + spin_lock(&linear_map_hash_lock); + BUG_ON(linear_map_hash_slots[lmi] & 0x80); + linear_map_hash_slots[lmi] = ret | 0x80; + spin_unlock(&linear_map_hash_lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash, hidx, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize); + spin_lock(&linear_map_hash_lock); + BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + hidx = linear_map_hash_slots[lmi] & 0x7f; + linear_map_hash_slots[lmi] = 0; + spin_unlock(&linear_map_hash_lock); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi); + else + kernel_unmap_linear_page(vaddr, lmi); + } + local_irq_restore(flags); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c new file mode 100644 index 0000000..f0c3b88 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage.c @@ -0,0 +1,781 @@ +/* + * PPC64 (POWER4) Huge TLB Page Support for Kernel. + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/sysctl.h> +#include <asm/mman.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/spu.h> + +#define PAGE_SHIFT_64K 16 +#define PAGE_SHIFT_16M 24 +#define PAGE_SHIFT_16G 34 + +#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) +#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) +#define MAX_NUMBER_GPAGES 1024 + +/* Tracks the 16G pages after the device tree is scanned and before the + * huge_boot_pages list is ready. */ +static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; +static unsigned nr_gpages; + +/* Array of valid huge page sizes - non-zero value(hugepte_shift) is + * stored for the huge page sizes that are valid. + */ +unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ + +#define hugepte_shift mmu_huge_psizes +#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize]) +#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize]) + +#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ + + hugepte_shift[psize]) +#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) +#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) + +/* Subtract one from array size because we don't need a cache for 4K since + * is not a huge page size */ +#define huge_pgtable_cache(psize) (pgtable_cache[HUGEPTE_CACHE_NUM \ + + psize-1]) +#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) + +static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { + "unused_4K", "hugepte_cache_64K", "unused_64K_AP", + "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" +}; + +/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() + * will choke on pointers to hugepte tables, which is handy for + * catching screwups early. */ +#define HUGEPD_OK 0x1 + +typedef struct { unsigned long pd; } hugepd_t; + +#define hugepd_none(hpd) ((hpd).pd == 0) + +static inline int shift_to_mmu_psize(unsigned int shift) +{ + switch (shift) { +#ifndef CONFIG_PPC_64K_PAGES + case PAGE_SHIFT_64K: + return MMU_PAGE_64K; +#endif + case PAGE_SHIFT_16M: + return MMU_PAGE_16M; + case PAGE_SHIFT_16G: + return MMU_PAGE_16G; + } + return -1; +} + +static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) +{ + if (mmu_psize_defs[mmu_psize].shift) + return mmu_psize_defs[mmu_psize].shift; + BUG(); +} + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!(hpd.pd & HUGEPD_OK)); + return (pte_t *)(hpd.pd & ~HUGEPD_OK); +} + +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, + struct hstate *hstate) +{ + unsigned int shift = huge_page_shift(hstate); + int psize = shift_to_mmu_psize(shift); + unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); + pte_t *dir = hugepd_page(*hpdp); + + return dir + idx; +} + +static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, + unsigned long address, unsigned int psize) +{ + pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize), + GFP_KERNEL|__GFP_REPEAT); + + if (! new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (!hugepd_none(*hpdp)) + kmem_cache_free(huge_pgtable_cache(psize), new); + else + hpdp->pd = (unsigned long)new | HUGEPD_OK; + spin_unlock(&mm->page_table_lock); + return 0; +} + + +static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PUD_SHIFT) + return pud_offset(pgd, addr); + else + return (pud_t *) pgd; +} +static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, + struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PUD_SHIFT) + return pud_alloc(mm, pgd, addr); + else + return (pud_t *) pgd; +} +static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PMD_SHIFT) + return pmd_offset(pud, addr); + else + return (pmd_t *) pud; +} +static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, + struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PMD_SHIFT) + return pmd_alloc(mm, pud, addr); + else + return (pmd_t *) pud; +} + +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy or bootmem allocator is setup. + */ +void add_gpage(unsigned long addr, unsigned long page_size, + unsigned long number_of_pages) +{ + if (!addr) + return; + while (number_of_pages > 0) { + gpage_freearray[nr_gpages] = addr; + nr_gpages++; + number_of_pages--; + addr += page_size; + } +} + +/* Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + if (nr_gpages == 0) + return 0; + m = phys_to_virt(gpage_freearray[--nr_gpages]); + gpage_freearray[nr_gpages] = 0; + list_add(&m->list, &huge_boot_pages); + m->hstate = hstate; + return 1; +} + + +/* Modelled after find_linux_pte() */ +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + + unsigned int psize; + unsigned int shift; + unsigned long sz; + struct hstate *hstate; + psize = get_slice_psize(mm, addr); + shift = mmu_psize_to_shift(psize); + sz = ((1UL) << shift); + hstate = size_to_hstate(sz); + + addr &= hstate->mask; + + pg = pgd_offset(mm, addr); + if (!pgd_none(*pg)) { + pu = hpud_offset(pg, addr, hstate); + if (!pud_none(*pu)) { + pm = hpmd_offset(pu, addr, hstate); + if (!pmd_none(*pm)) + return hugepte_offset((hugepd_t *)pm, addr, + hstate); + } + } + + return NULL; +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + struct hstate *hstate; + unsigned int psize; + hstate = size_to_hstate(sz); + + psize = get_slice_psize(mm, addr); + BUG_ON(!mmu_huge_psizes[psize]); + + addr &= hstate->mask; + + pg = pgd_offset(mm, addr); + pu = hpud_alloc(mm, pg, addr, hstate); + + if (pu) { + pm = hpmd_alloc(mm, pu, addr, hstate); + if (pm) + hpdp = (hugepd_t *)pm; + } + + if (! hpdp) + return NULL; + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize)) + return NULL; + + return hugepte_offset(hpdp, addr, hstate); +} + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} + +static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, + unsigned int psize) +{ + pte_t *hugepte = hugepd_page(*hpdp); + + hpdp->pd = 0; + tlb->need_flush = 1; + pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, + HUGEPTE_CACHE_NUM+psize-1, + PGF_CACHENUM_MASK)); +} + +static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling, + unsigned int psize) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) + continue; + free_hugepte_range(tlb, (hugepd_t *)pmd, psize); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); +} + +static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + unsigned int shift; + unsigned int psize = get_slice_psize(tlb->mm, addr); + shift = mmu_psize_to_shift(psize); + + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (shift < PMD_SHIFT) { + if (pud_none_or_clear_bad(pud)) + continue; + hugetlb_free_pmd_range(tlb, pud, addr, next, floor, + ceiling, psize); + } else { + if (pud_none(*pud)) + continue; + free_hugepte_range(tlb, (hugepd_t *)pud, psize); + } + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); +} + +/* + * This function frees user-level page tables of a process. + * + * Must be called with pagetable lock held. + */ +void hugetlb_free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start; + + /* + * Comments below take from the normal free_pgd_range(). They + * apply here too. The tests against HUGEPD_MASK below are + * essential, because we *don't* test for this at the bottom + * level. Without them we'll attempt to free a hugepte table + * when we unmap just part of it, even if there are other + * active mappings using it. + * + * The next few lines have given us lots of grief... + * + * Why are we testing HUGEPD* at this top level? Because + * often there will be no work to do at all, and we'd prefer + * not to go all the way down to the bottom just to discover + * that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we + * must be careful to reject "the opposite 0" before it + * confuses the subsequent tests. But what about where end is + * brought down by HUGEPD_SIZE below? no, end can't go down to + * 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. + */ + unsigned int psize = get_slice_psize(tlb->mm, addr); + + addr &= HUGEPD_MASK(psize); + if (addr < floor) { + addr += HUGEPD_SIZE(psize); + if (!addr) + return; + } + if (ceiling) { + ceiling &= HUGEPD_MASK(psize); + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= HUGEPD_SIZE(psize); + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset(tlb->mm, addr); + do { + psize = get_slice_psize(tlb->mm, addr); + BUG_ON(!mmu_huge_psizes[psize]); + next = pgd_addr_end(addr, end); + if (mmu_psize_to_shift(psize) < PUD_SHIFT) { + if (pgd_none_or_clear_bad(pgd)) + continue; + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } else { + if (pgd_none(*pgd)) + continue; + free_hugepte_range(tlb, (hugepd_t *)pgd, psize); + } + } while (pgd++, addr = next, addr != end); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (pte_present(*ptep)) { + /* We open-code pte_clear because we need to pass the right + * argument to hpte_need_flush (huge / !huge). Might not be + * necessary anymore if we make hpte_need_flush() get the + * page size from the slices + */ + unsigned int psize = get_slice_psize(mm, addr); + unsigned int shift = mmu_psize_to_shift(psize); + unsigned long sz = ((1UL) << shift); + struct hstate *hstate = size_to_hstate(sz); + pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1); + } + *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); + return __pte(old); +} + +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + pte_t *ptep; + struct page *page; + unsigned int mmu_psize = get_slice_psize(mm, address); + + /* Verify it is a huge page else bail. */ + if (!mmu_huge_psizes[mmu_psize]) + return ERR_PTR(-EINVAL); + + ptep = huge_pte_offset(mm, address); + page = pte_page(*ptep); + if (page) { + unsigned int shift = mmu_psize_to_shift(mmu_psize); + unsigned long sz = ((1UL) << shift); + page += (address % sz) / PAGE_SIZE; + } + + return page; +} + +int pmd_huge(pmd_t pmd) +{ + return 0; +} + +int pud_huge(pud_t pud) +{ + return 0; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + BUG(); + return NULL; +} + + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct hstate *hstate = hstate_file(file); + int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + + if (!mmu_huge_psizes[mmu_psize]) + return -EINVAL; + return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); +} + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, + pte_t pte, int trap, unsigned long sz) +{ + struct page *page; + int i; + + if (!pfn_valid(pte_pfn(pte))) + return rflags; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + for (i = 0; i < (sz / PAGE_SIZE); i++) + __flush_dcache_icache(page_address(page+i)); + set_bit(PG_arch_1, &page->flags); + } else { + rflags |= HPTE_R_N; + } + } + return rflags; +} + +int hash_huge_page(struct mm_struct *mm, unsigned long access, + unsigned long ea, unsigned long vsid, int local, + unsigned long trap) +{ + pte_t *ptep; + unsigned long old_pte, new_pte; + unsigned long va, rflags, pa, sz; + long slot; + int err = 1; + int ssize = user_segment_size(ea); + unsigned int mmu_psize; + int shift; + mmu_psize = get_slice_psize(mm, ea); + + if (!mmu_huge_psizes[mmu_psize]) + goto out; + ptep = huge_pte_offset(mm, ea); + + /* Search the Linux page table for a match with va */ + va = hpt_va(ea, vsid, ssize); + + /* + * If no pte found or not present, send the problem up to + * do_page_fault + */ + if (unlikely(!ptep || pte_none(*ptep))) + goto out; + + /* + * Check the user's access rights to the page. If access should be + * prevented then send the problem up to do_page_fault. + */ + if (unlikely(access & ~pte_val(*ptep))) + goto out; + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + do { + old_pte = pte_val(*ptep); + if (old_pte & _PAGE_BUSY) + goto out; + new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, + old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); + shift = mmu_psize_to_shift(mmu_psize); + sz = ((1UL) << shift); + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case */ + rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), + trap, sz); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & _PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long hash, slot; + + hash = hpt_hash(va, shift, ssize); + if (old_pte & _PAGE_F_SECOND) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (old_pte & _PAGE_F_GIX) >> 12; + + if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, + ssize, local) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(va, shift, ssize); + unsigned long hpte_group; + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + +repeat: + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + /* clear HPTE slot informations in new PTE */ +#ifdef CONFIG_PPC_64K_PAGES + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; +#else + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; +#endif + /* Add in WIMG bits */ + rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, + mmu_psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, + HPTE_V_SECONDARY, + mmu_psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP)&~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + + if (unlikely(slot == -2)) + panic("hash_huge_page: pte_insert failed\n"); + + new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~_PAGE_BUSY); + + err = 0; + + out: + return err; +} + +static void __init set_huge_psize(int psize) +{ + /* Check that it is a page size supported by the hardware and + * that it fits within pagetable limits. */ + if (mmu_psize_defs[psize].shift && + mmu_psize_defs[psize].shift < SID_SHIFT_1T && + (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT || + mmu_psize_defs[psize].shift == PAGE_SHIFT_64K || + mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) { + /* Return if huge page size has already been setup or is the + * same as the base page size. */ + if (mmu_huge_psizes[psize] || + mmu_psize_defs[psize].shift == PAGE_SHIFT) + return; + hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); + + switch (mmu_psize_defs[psize].shift) { + case PAGE_SHIFT_64K: + /* We only allow 64k hpages with 4k base page, + * which was checked above, and always put them + * at the PMD */ + hugepte_shift[psize] = PMD_SHIFT; + break; + case PAGE_SHIFT_16M: + /* 16M pages can be at two different levels + * of pagestables based on base page size */ + if (PAGE_SHIFT == PAGE_SHIFT_64K) + hugepte_shift[psize] = PMD_SHIFT; + else /* 4k base page */ + hugepte_shift[psize] = PUD_SHIFT; + break; + case PAGE_SHIFT_16G: + /* 16G pages are always at PGD level */ + hugepte_shift[psize] = PGDIR_SHIFT; + break; + } + hugepte_shift[psize] -= mmu_psize_defs[psize].shift; + } else + hugepte_shift[psize] = 0; +} + +static int __init hugepage_setup_sz(char *str) +{ + unsigned long long size; + int mmu_psize; + int shift; + + size = memparse(str, &str); + + shift = __ffs(size); + mmu_psize = shift_to_mmu_psize(shift); + if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) + set_huge_psize(mmu_psize); + else + printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); + + return 1; +} +__setup("hugepagesz=", hugepage_setup_sz); + +static int __init hugetlbpage_init(void) +{ + unsigned int psize; + + if (!cpu_has_feature(CPU_FTR_16M_PAGE)) + return -ENODEV; + + /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE + * and adjust PTE_NONCACHE_NUM if the number of supported huge page + * sizes changes. + */ + set_huge_psize(MMU_PAGE_16M); + set_huge_psize(MMU_PAGE_16G); + + /* Temporarily disable support for 64K huge pages when 64K SPU local + * store support is enabled as the current implementation conflicts. + */ +#ifndef CONFIG_SPU_FS_64K_LS + set_huge_psize(MMU_PAGE_64K); +#endif + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + if (mmu_huge_psizes[psize]) { + huge_pgtable_cache(psize) = kmem_cache_create( + HUGEPTE_CACHE_NAME(psize), + HUGEPTE_TABLE_SIZE(psize), + HUGEPTE_TABLE_SIZE(psize), + 0, + NULL); + if (!huge_pgtable_cache(psize)) + panic("hugetlbpage_init(): could not create %s"\ + "\n", HUGEPTE_CACHE_NAME(psize)); + } + } + + return 0; +} + +module_init(hugetlbpage_init); diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c new file mode 100644 index 0000000..388ceda --- /dev/null +++ b/arch/powerpc/mm/init_32.c @@ -0,0 +1,290 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * PPC44x/36-bit changes by Matt Porter (mporter@mvista.com) + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/lmb.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/btext.h> +#include <asm/tlb.h> +#include <asm/sections.h> +#include <asm/system.h> + +#include "mmu_decl.h" + +#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) +/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */ +#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE)) +#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL" +#endif +#endif +#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + +phys_addr_t total_memory; +phys_addr_t total_lowmem; + +phys_addr_t memstart_addr = (phys_addr_t)~0ull; +EXPORT_SYMBOL(memstart_addr); +phys_addr_t kernstart_addr; +EXPORT_SYMBOL(kernstart_addr); +phys_addr_t lowmem_end_addr; + +int boot_mapsize; +#ifdef CONFIG_PPC_PMAC +unsigned long agp_special_page; +EXPORT_SYMBOL(agp_special_page); +#endif + +void MMU_init(void); + +/* XXX should be in current.h -- paulus */ +extern struct task_struct *current_set[NR_CPUS]; + +/* + * this tells the system to map all of ram with the segregs + * (i.e. page tables) instead of the bats. + * -- Cort + */ +int __map_without_bats; +int __map_without_ltlbs; + +/* max amount of low RAM to map in */ +unsigned long __max_low_memory = MAX_LOW_MEM; + +/* + * address of the limit of what is accessible with initial MMU setup - + * 256MB usually, but only 16MB on 601. + */ +phys_addr_t __initial_memory_limit_addr = (phys_addr_t)0x10000000; + +/* + * Check for command-line options that affect what MMU_init will do. + */ +void MMU_setup(void) +{ + /* Check for nobats option (used in mapin_ram). */ + if (strstr(cmd_line, "nobats")) { + __map_without_bats = 1; + } + + if (strstr(cmd_line, "noltlbs")) { + __map_without_ltlbs = 1; + } +#ifdef CONFIG_DEBUG_PAGEALLOC + __map_without_bats = 1; + __map_without_ltlbs = 1; +#endif +} + +/* + * MMU_init sets up the basic memory mappings for the kernel, + * including both RAM and possibly some I/O regions, + * and sets up the page tables and the MMU hardware ready to go. + */ +void __init MMU_init(void) +{ + if (ppc_md.progress) + ppc_md.progress("MMU:enter", 0x111); + + /* 601 can only access 16MB at the moment */ + if (PVR_VER(mfspr(SPRN_PVR)) == 1) + __initial_memory_limit_addr = 0x01000000; + /* 8xx can only access 8MB at the moment */ + if (PVR_VER(mfspr(SPRN_PVR)) == 0x50) + __initial_memory_limit_addr = 0x00800000; + + /* parse args from command line */ + MMU_setup(); + + if (lmb.memory.cnt > 1) { + lmb.memory.cnt = 1; + lmb_analyze(); + printk(KERN_WARNING "Only using first contiguous memory region"); + } + + total_lowmem = total_memory = lmb_end_of_DRAM() - memstart_addr; + lowmem_end_addr = memstart_addr + total_lowmem; + +#ifdef CONFIG_FSL_BOOKE + /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB + * entries, so we need to adjust lowmem to match the amount we can map + * in the fixed entries */ + adjust_total_lowmem(); +#endif /* CONFIG_FSL_BOOKE */ + + if (total_lowmem > __max_low_memory) { + total_lowmem = __max_low_memory; + lowmem_end_addr = memstart_addr + total_lowmem; +#ifndef CONFIG_HIGHMEM + total_memory = total_lowmem; + lmb_enforce_memory_limit(lowmem_end_addr); + lmb_analyze(); +#endif /* CONFIG_HIGHMEM */ + } + + /* Initialize the MMU hardware */ + if (ppc_md.progress) + ppc_md.progress("MMU:hw init", 0x300); + MMU_init_hw(); + + /* Map in all of RAM starting at KERNELBASE */ + if (ppc_md.progress) + ppc_md.progress("MMU:mapin", 0x301); + mapin_ram(); + +#ifdef CONFIG_HIGHMEM + ioremap_base = PKMAP_BASE; +#else + ioremap_base = 0xfe000000UL; /* for now, could be 0xfffff000 */ +#endif /* CONFIG_HIGHMEM */ + ioremap_bot = ioremap_base; + + /* Map in I/O resources */ + if (ppc_md.progress) + ppc_md.progress("MMU:setio", 0x302); + + /* Initialize the context management stuff */ + mmu_context_init(); + + if (ppc_md.progress) + ppc_md.progress("MMU:exit", 0x211); + + /* From now on, btext is no longer BAT mapped if it was at all */ +#ifdef CONFIG_BOOTX_TEXT + btext_unmap(); +#endif +} + +/* This is only called until mem_init is done. */ +void __init *early_get_page(void) +{ + void *p; + + if (init_bootmem_done) { + p = alloc_bootmem_pages(PAGE_SIZE); + } else { + p = __va(lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, + __initial_memory_limit_addr)); + } + return p; +} + +/* Free up now-unused memory */ +static void free_sec(unsigned long start, unsigned long end, const char *name) +{ + unsigned long cnt = 0; + + while (start < end) { + ClearPageReserved(virt_to_page(start)); + init_page_count(virt_to_page(start)); + free_page(start); + cnt++; + start += PAGE_SIZE; + } + if (cnt) { + printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name); + totalram_pages += cnt; + } +} + +void free_initmem(void) +{ +#define FREESEC(TYPE) \ + free_sec((unsigned long)(&__ ## TYPE ## _begin), \ + (unsigned long)(&__ ## TYPE ## _end), \ + #TYPE); + + printk ("Freeing unused kernel memory:"); + FREESEC(init); + printk("\n"); + ppc_md.progress = NULL; +#undef FREESEC +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + if (start < end) + printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); + for (; start < end; start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + init_page_count(virt_to_page(start)); + free_page(start); + totalram_pages++; + } +} +#endif + +#ifdef CONFIG_PROC_KCORE +static struct kcore_list kcore_vmem; + +static int __init setup_kcore(void) +{ + int i; + + for (i = 0; i < lmb.memory.cnt; i++) { + unsigned long base; + unsigned long size; + struct kcore_list *kcore_mem; + + base = lmb.memory.region[i].base; + size = lmb.memory.region[i].size; + + kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); + if (!kcore_mem) + panic("%s: kmalloc failed\n", __func__); + + /* must stay under 32 bits */ + if ( 0xfffffffful - (unsigned long)__va(base) < size) { + size = 0xfffffffful - (unsigned long)(__va(base)); + printk(KERN_DEBUG "setup_kcore: restrict size=%lx\n", + size); + } + + kclist_add(kcore_mem, __va(base), size); + } + + kclist_add(&kcore_vmem, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + + return 0; +} +module_init(setup_kcore); +#endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c new file mode 100644 index 0000000..3e6a654 --- /dev/null +++ b/arch/powerpc/mm/init_64.c @@ -0,0 +1,238 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#undef DEBUG + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/idr.h> +#include <linux/nodemask.h> +#include <linux/module.h> +#include <linux/poison.h> +#include <linux/lmb.h> + +#include <asm/pgalloc.h> +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/tlb.h> +#include <asm/eeh.h> +#include <asm/processor.h> +#include <asm/mmzone.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/system.h> +#include <asm/iommu.h> +#include <asm/abs_addr.h> +#include <asm/vdso.h> + +#include "mmu_decl.h" + +#if PGTABLE_RANGE > USER_VSID_RANGE +#warning Limited user VSID range means pagetable space is wasted +#endif + +#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#warning TASK_SIZE is smaller than it needs to be. +#endif + +phys_addr_t memstart_addr = ~0; +phys_addr_t kernstart_addr; + +void free_initmem(void) +{ + unsigned long addr; + + addr = (unsigned long)__init_begin; + for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + free_page(addr); + totalram_pages++; + } + printk ("Freeing unused kernel memory: %luk freed\n", + ((unsigned long)__init_end - (unsigned long)__init_begin) >> 10); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + if (start < end) + printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); + for (; start < end; start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + init_page_count(virt_to_page(start)); + free_page(start); + totalram_pages++; + } +} +#endif + +#ifdef CONFIG_PROC_KCORE +static struct kcore_list kcore_vmem; + +static int __init setup_kcore(void) +{ + int i; + + for (i=0; i < lmb.memory.cnt; i++) { + unsigned long base, size; + struct kcore_list *kcore_mem; + + base = lmb.memory.region[i].base; + size = lmb.memory.region[i].size; + + /* GFP_ATOMIC to avoid might_sleep warnings during boot */ + kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC); + if (!kcore_mem) + panic("%s: kmalloc failed\n", __func__); + + kclist_add(kcore_mem, __va(base), size); + } + + kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); + + return 0; +} +module_init(setup_kcore); +#endif + +static void pgd_ctor(void *addr) +{ + memset(addr, 0, PGD_TABLE_SIZE); +} + +static void pmd_ctor(void *addr) +{ + memset(addr, 0, PMD_TABLE_SIZE); +} + +static const unsigned int pgtable_cache_size[2] = { + PGD_TABLE_SIZE, PMD_TABLE_SIZE +}; +static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { +#ifdef CONFIG_PPC_64K_PAGES + "pgd_cache", "pmd_cache", +#else + "pgd_cache", "pud_pmd_cache", +#endif /* CONFIG_PPC_64K_PAGES */ +}; + +#ifdef CONFIG_HUGETLB_PAGE +/* Hugepages need an extra cache per hugepagesize, initialized in + * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT + * is not compile time constant. */ +struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; +#else +struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; +#endif + +void pgtable_cache_init(void) +{ + pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); + pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Given an address within the vmemmap, determine the pfn of the page that + * represents the start of the section it is within. Note that we have to + * do this by hand as the proffered address may not be correctly aligned. + * Subtraction of non-aligned pointers produces undefined results. + */ +static unsigned long __meminit vmemmap_section_start(unsigned long page) +{ + unsigned long offset = page - ((unsigned long)(vmemmap)); + + /* Return the pfn of the start of the section. */ + return (offset / sizeof(struct page)) & PAGE_SECTION_MASK; +} + +/* + * Check if this vmemmap page is already initialised. If any section + * which overlaps this vmemmap page is initialised then this page is + * initialised already. + */ +static int __meminit vmemmap_populated(unsigned long start, int page_size) +{ + unsigned long end = start + page_size; + + for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) + if (pfn_valid(vmemmap_section_start(start))) + return 1; + + return 0; +} + +int __meminit vmemmap_populate(struct page *start_page, + unsigned long nr_pages, int node) +{ + unsigned long start = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + nr_pages); + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + + /* Align to the page size of the linear mapping. */ + start = _ALIGN_DOWN(start, page_size); + + for (; start < end; start += page_size) { + int mapped; + void *p; + + if (vmemmap_populated(start, page_size)) + continue; + + p = vmemmap_alloc_block(page_size, node); + if (!p) + return -ENOMEM; + + pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", + start, p, __pa(p)); + + mapped = htab_bolt_mapping(start, start + page_size, __pa(p), + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + BUG_ON(mapped < 0); + } + + return 0; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c new file mode 100644 index 0000000..b9e1a1d --- /dev/null +++ b/arch/powerpc/mm/mem.c @@ -0,0 +1,529 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * PPC44x/36-bit changes by Matt Porter (mporter@mvista.com) + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/suspend.h> +#include <linux/lmb.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/btext.h> +#include <asm/tlb.h> +#include <asm/sections.h> +#include <asm/sparsemem.h> +#include <asm/vdso.h> +#include <asm/fixmap.h> + +#include "mmu_decl.h" + +#ifndef CPU_FTR_COHERENT_ICACHE +#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ +#define CPU_FTR_NOEXECUTE 0 +#endif + +int init_bootmem_done; +int mem_init_done; +unsigned long memory_limit; + +#ifdef CONFIG_HIGHMEM +pte_t *kmap_pte; +pgprot_t kmap_prot; + +EXPORT_SYMBOL(kmap_prot); +EXPORT_SYMBOL(kmap_pte); + +static inline pte_t *virt_to_kpte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} +#endif + +int page_is_ram(unsigned long pfn) +{ +#ifndef CONFIG_PPC64 /* XXX for now */ + return pfn < max_pfn; +#else + unsigned long paddr = (pfn << PAGE_SHIFT); + int i; + for (i=0; i < lmb.memory.cnt; i++) { + unsigned long base; + + base = lmb.memory.region[i].base; + + if ((paddr >= base) && + (paddr < (base + lmb.memory.region[i].size))) { + return 1; + } + } + + return 0; +#endif +} + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + if (ppc_md.phys_mem_access_prot) + return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot); + + if (!page_is_ram(pfn)) + vma_prot = __pgprot(pgprot_val(vma_prot) + | _PAGE_GUARDED | _PAGE_NO_CACHE); + return vma_prot; +} +EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_NUMA +int memory_add_physaddr_to_nid(u64 start) +{ + return hot_add_scn_to_nid(start); +} +#endif + +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdata; + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + pgdata = NODE_DATA(nid); + + start = (unsigned long)__va(start); + create_section_mapping(start, start + size); + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +/* + * walk_memory_resource() needs to make sure there is no holes in a given + * memory range. PPC64 does not maintain the memory layout in /proc/iomem. + * Instead it maintains it in lmb.memory structures. Walk through the + * memory regions, find holes and callback for contiguous regions. + */ +int +walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, + int (*func)(unsigned long, unsigned long, void *)) +{ + struct lmb_property res; + unsigned long pfn, len; + u64 end; + int ret = -1; + + res.base = (u64) start_pfn << PAGE_SHIFT; + res.size = (u64) nr_pages << PAGE_SHIFT; + + end = res.base + res.size - 1; + while ((res.base < end) && (lmb_find(&res) >= 0)) { + pfn = (unsigned long)(res.base >> PAGE_SHIFT); + len = (unsigned long)(res.size >> PAGE_SHIFT); + ret = (*func)(pfn, len, arg); + if (ret) + break; + res.base += (res.size + 1); + res.size = (end - res.base + 1); + } + return ret; +} +EXPORT_SYMBOL_GPL(walk_memory_resource); + +/* + * Initialize the bootmem system and give it all the memory we + * have available. If we are using highmem, we only put the + * lowmem into the bootmem system. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +void __init do_init_bootmem(void) +{ + unsigned long i; + unsigned long start, bootmap_pages; + unsigned long total_pages; + int boot_mapsize; + + max_low_pfn = max_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; + total_pages = (lmb_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT; +#ifdef CONFIG_HIGHMEM + total_pages = total_lowmem >> PAGE_SHIFT; + max_low_pfn = lowmem_end_addr >> PAGE_SHIFT; +#endif + + /* + * Find an area to use for the bootmem bitmap. Calculate the size of + * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE. + * Add 1 additional page in case the address isn't page-aligned. + */ + bootmap_pages = bootmem_bootmap_pages(total_pages); + + start = lmb_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE); + + min_low_pfn = MEMORY_START >> PAGE_SHIFT; + boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn); + + /* Add active regions with valid PFNs */ + for (i = 0; i < lmb.memory.cnt; i++) { + unsigned long start_pfn, end_pfn; + start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; + end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i); + add_active_range(0, start_pfn, end_pfn); + } + + /* Add all physical memory to the bootmem map, mark each area + * present. + */ +#ifdef CONFIG_HIGHMEM + free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT); + + /* reserve the sections we're already using */ + for (i = 0; i < lmb.reserved.cnt; i++) { + unsigned long addr = lmb.reserved.region[i].base + + lmb_size_bytes(&lmb.reserved, i) - 1; + if (addr < lowmem_end_addr) + reserve_bootmem(lmb.reserved.region[i].base, + lmb_size_bytes(&lmb.reserved, i), + BOOTMEM_DEFAULT); + else if (lmb.reserved.region[i].base < lowmem_end_addr) { + unsigned long adjusted_size = lowmem_end_addr - + lmb.reserved.region[i].base; + reserve_bootmem(lmb.reserved.region[i].base, + adjusted_size, BOOTMEM_DEFAULT); + } + } +#else + free_bootmem_with_active_regions(0, max_pfn); + + /* reserve the sections we're already using */ + for (i = 0; i < lmb.reserved.cnt; i++) + reserve_bootmem(lmb.reserved.region[i].base, + lmb_size_bytes(&lmb.reserved, i), + BOOTMEM_DEFAULT); + +#endif + /* XXX need to clip this if using highmem? */ + sparse_memory_present_with_active_regions(0); + + init_bootmem_done = 1; +} + +/* mark pages that don't exist as nosave */ +static int __init mark_nonram_nosave(void) +{ + unsigned long lmb_next_region_start_pfn, + lmb_region_max_pfn; + int i; + + for (i = 0; i < lmb.memory.cnt - 1; i++) { + lmb_region_max_pfn = + (lmb.memory.region[i].base >> PAGE_SHIFT) + + (lmb.memory.region[i].size >> PAGE_SHIFT); + lmb_next_region_start_pfn = + lmb.memory.region[i+1].base >> PAGE_SHIFT; + + if (lmb_region_max_pfn < lmb_next_region_start_pfn) + register_nosave_region(lmb_region_max_pfn, + lmb_next_region_start_pfn); + } + + return 0; +} + +/* + * paging_init() sets up the page tables - in fact we've already done this. + */ +void __init paging_init(void) +{ + unsigned long total_ram = lmb_phys_mem_size(); + phys_addr_t top_of_ram = lmb_end_of_DRAM(); + unsigned long max_zone_pfns[MAX_NR_ZONES]; + +#ifdef CONFIG_PPC32 + unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); + unsigned long end = __fix_to_virt(FIX_HOLE); + + for (; v < end; v += PAGE_SIZE) + map_page(v, 0, 0); /* XXX gross */ +#endif + +#ifdef CONFIG_HIGHMEM + map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + pkmap_page_table = virt_to_kpte(PKMAP_BASE); + + kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + kmap_prot = PAGE_KERNEL; +#endif /* CONFIG_HIGHMEM */ + + printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%lx\n", + (unsigned long long)top_of_ram, total_ram); + printk(KERN_DEBUG "Memory hole size: %ldMB\n", + (long int)((top_of_ram - total_ram) >> 20)); + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT; + max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT; +#else + max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; +#endif + free_area_init_nodes(max_zone_pfns); + + mark_nonram_nosave(); +} +#endif /* ! CONFIG_NEED_MULTIPLE_NODES */ + +void __init mem_init(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + int nid; +#endif + pg_data_t *pgdat; + unsigned long i; + struct page *page; + unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; + + num_physpages = lmb.memory.size >> PAGE_SHIFT; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + +#ifdef CONFIG_NEED_MULTIPLE_NODES + for_each_online_node(nid) { + if (NODE_DATA(nid)->node_spanned_pages != 0) { + printk("freeing bootmem node %d\n", nid); + totalram_pages += + free_all_bootmem_node(NODE_DATA(nid)); + } + } +#else + max_mapnr = max_pfn; + totalram_pages += free_all_bootmem(); +#endif + for_each_online_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; i++) { + if (!pfn_valid(pgdat->node_start_pfn + i)) + continue; + page = pgdat_page_nr(pgdat, i); + if (PageReserved(page)) + reservedpages++; + } + } + + codesize = (unsigned long)&_sdata - (unsigned long)&_stext; + datasize = (unsigned long)&_edata - (unsigned long)&_sdata; + initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; + bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; + +#ifdef CONFIG_HIGHMEM + { + unsigned long pfn, highmem_mapnr; + + highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT; + for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { + struct page *page = pfn_to_page(pfn); + if (lmb_is_reserved(pfn << PAGE_SHIFT)) + continue; + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; + reservedpages--; + } + totalram_pages += totalhigh_pages; + printk(KERN_DEBUG "High memory: %luk\n", + totalhigh_pages << (PAGE_SHIFT-10)); + } +#endif /* CONFIG_HIGHMEM */ + + printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, " + "%luk reserved, %luk data, %luk bss, %luk init)\n", + (unsigned long)nr_free_pages() << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + bsssize >> 10, + initsize >> 10); + + mem_init_done = 1; +} + +/* + * This is called when a page has been modified by the kernel. + * It just marks the page as not i-cache clean. We do the i-cache + * flush later when the page is given to a user process, if necessary. + */ +void flush_dcache_page(struct page *page) +{ + if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + return; + /* avoid an atomic op if possible */ + if (test_bit(PG_arch_1, &page->flags)) + clear_bit(PG_arch_1, &page->flags); +} +EXPORT_SYMBOL(flush_dcache_page); + +void flush_dcache_icache_page(struct page *page) +{ +#ifdef CONFIG_BOOKE + void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); + __flush_dcache_icache(start); + kunmap_atomic(start, KM_PPC_SYNC_ICACHE); +#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) + /* On 8xx there is no need to kmap since highmem is not supported */ + __flush_dcache_icache(page_address(page)); +#else + __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); +#endif + +} +void clear_user_page(void *page, unsigned long vaddr, struct page *pg) +{ + clear_page(page); + + /* + * We shouldnt have to do this, but some versions of glibc + * require it (ld.so assumes zero filled pages are icache clean) + * - Anton + */ + flush_dcache_page(pg); +} +EXPORT_SYMBOL(clear_user_page); + +void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, + struct page *pg) +{ + copy_page(vto, vfrom); + + /* + * We should be able to use the following optimisation, however + * there are two problems. + * Firstly a bug in some versions of binutils meant PLT sections + * were not marked executable. + * Secondly the first word in the GOT section is blrl, used + * to establish the GOT address. Until recently the GOT was + * not marked executable. + * - Anton + */ +#if 0 + if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) + return; +#endif + + flush_dcache_page(pg); +} + +void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len) +{ + unsigned long maddr; + + maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK); + flush_icache_range(maddr, maddr + len); + kunmap(page); +} +EXPORT_SYMBOL(flush_icache_user_range); + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t pte) +{ +#ifdef CONFIG_PPC_STD_MMU + unsigned long access = 0, trap; +#endif + unsigned long pfn = pte_pfn(pte); + + /* handle i-cache coherency */ + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && + !cpu_has_feature(CPU_FTR_NOEXECUTE) && + pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); +#ifdef CONFIG_8xx + /* On 8xx, cache control instructions (particularly + * "dcbst" from flush_dcache_icache) fault as write + * operation if there is an unpopulated TLB entry + * for the address in question. To workaround that, + * we invalidate the TLB here, thus avoiding dcbst + * misbehaviour. + */ + _tlbie(address, 0 /* 8xx doesn't care about PID */); +#endif + /* The _PAGE_USER test should really be _PAGE_EXEC, but + * older glibc versions execute some code from no-exec + * pages, which for now we are supporting. If exec-only + * pages are ever implemented, this will have to change. + */ + if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER) + && !test_bit(PG_arch_1, &page->flags)) { + if (vma->vm_mm == current->active_mm) { + __flush_dcache_icache((void *) address); + } else + flush_dcache_icache_page(page); + set_bit(PG_arch_1, &page->flags); + } + } + +#ifdef CONFIG_PPC_STD_MMU + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(pte) || address >= TASK_SIZE) + return; + + /* We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot + * + * We also avoid filling the hash if not coming from a fault + */ + if (current->thread.regs == NULL) + return; + trap = TRAP(current->thread.regs); + if (trap == 0x400) + access |= _PAGE_EXEC; + else if (trap != 0x300) + return; + hash_preload(vma->vm_mm, address, access, trap); +#endif /* CONFIG_PPC_STD_MMU */ +} diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c new file mode 100644 index 0000000..86010fc --- /dev/null +++ b/arch/powerpc/mm/mmap.c @@ -0,0 +1,85 @@ +/* + * flexible mmap layout support + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Started by Ingo Molnar <mingo@elte.hu> + */ + +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/sched.h> + +/* + * Top of mmap area (just below the process stack). + * + * Leave an at least ~128 MB hole. + */ +#define MIN_GAP (128*1024*1024) +#define MAX_GAP (TASK_SIZE/6*5) + +static inline unsigned long mmap_base(void) +{ + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return TASK_SIZE - (gap & PAGE_MASK); +} + +static inline int mmap_is_legacy(void) +{ + /* + * Force standard allocation for 64 bit programs. + */ + if (!test_thread_flag(TIF_32BIT)) + return 1; + + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + /* + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +} diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c new file mode 100644 index 0000000..cc32ba4 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_32.c @@ -0,0 +1,84 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * 8260, and POWER3 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/mm.h> +#include <linux/init.h> + +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> + +unsigned long next_mmu_context; +unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; +#ifdef FEW_CONTEXTS +atomic_t nr_free_contexts; +struct mm_struct *context_mm[LAST_CONTEXT+1]; +void steal_context(void); +#endif /* FEW_CONTEXTS */ + +/* + * Initialize the context management stuff. + */ +void __init +mmu_context_init(void) +{ + /* + * Some processors have too few contexts to reserve one for + * init_mm, and require using context 0 for a normal task. + * Other processors reserve the use of context zero for the kernel. + * This code assumes FIRST_CONTEXT < 32. + */ + context_map[0] = (1 << FIRST_CONTEXT) - 1; + next_mmu_context = FIRST_CONTEXT; +#ifdef FEW_CONTEXTS + atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1); +#endif /* FEW_CONTEXTS */ +} + +#ifdef FEW_CONTEXTS +/* + * Steal a context from a task that has one at the moment. + * This is only used on 8xx and 4xx and we presently assume that + * they don't do SMP. If they do then this will have to check + * whether the MM we steal is in use. + * We also assume that this is only used on systems that don't + * use an MMU hash table - this is true for 8xx and 4xx. + * This isn't an LRU system, it just frees up each context in + * turn (sort-of pseudo-random replacement :). This would be the + * place to implement an LRU scheme if anyone was motivated to do it. + * -- paulus + */ +void +steal_context(void) +{ + struct mm_struct *mm; + + /* free up context `next_mmu_context' */ + /* if we shouldn't free context 0, don't... */ + if (next_mmu_context < FIRST_CONTEXT) + next_mmu_context = FIRST_CONTEXT; + mm = context_mm[next_mmu_context]; + flush_tlb_mm(mm); + destroy_context(mm); +} +#endif /* FEW_CONTEXTS */ diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c new file mode 100644 index 0000000..1db38ba --- /dev/null +++ b/arch/powerpc/mm/mmu_context_64.c @@ -0,0 +1,70 @@ +/* + * MMU context allocation for 64-bit kernels. + * + * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/idr.h> + +#include <asm/mmu_context.h> + +static DEFINE_SPINLOCK(mmu_context_lock); +static DEFINE_IDR(mmu_context_idr); + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + int err; + +again: + if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&mmu_context_lock); + err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index); + spin_unlock(&mmu_context_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_CONTEXT) { + spin_lock(&mmu_context_lock); + idr_remove(&mmu_context_idr, index); + spin_unlock(&mmu_context_lock); + return -ENOMEM; + } + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + */ + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); + mm->context.id = index; + + return 0; +} + +void destroy_context(struct mm_struct *mm) +{ + spin_lock(&mmu_context_lock); + idr_remove(&mmu_context_idr, mm->context.id); + spin_unlock(&mmu_context_lock); + + mm->context.id = NO_CONTEXT; +} diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h new file mode 100644 index 0000000..fab3cfa --- /dev/null +++ b/arch/powerpc/mm/mmu_decl.h @@ -0,0 +1,94 @@ +/* + * Declarations of procedures and variables shared between files + * in arch/ppc/mm/. + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/mm.h> +#include <asm/tlbflush.h> +#include <asm/mmu.h> + +extern void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap); + + +#ifdef CONFIG_PPC32 +extern void mapin_ram(void); +extern int map_page(unsigned long va, phys_addr_t pa, int flags); +extern void setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, int flags); +extern void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, int flags, unsigned int pid); +extern void invalidate_tlbcam_entry(int index); + +extern int __map_without_bats; +extern unsigned long ioremap_base; +extern unsigned int rtas_data, rtas_size; + +struct hash_pte; +extern struct hash_pte *Hash, *Hash_end; +extern unsigned long Hash_size, Hash_mask; + +extern unsigned int num_tlbcam_entries; +#endif + +extern unsigned long ioremap_bot; +extern unsigned long __max_low_memory; +extern phys_addr_t __initial_memory_limit_addr; +extern phys_addr_t total_memory; +extern phys_addr_t total_lowmem; +extern phys_addr_t memstart_addr; +extern phys_addr_t lowmem_end_addr; + +/* ...and now those things that may be slightly different between processor + * architectures. -- Dan + */ +#if defined(CONFIG_8xx) +#define flush_HPTE(X, va, pg) _tlbie(va, 0 /* 8xx doesn't care about PID */) +#define MMU_init_hw() do { } while(0) +#define mmu_mapin_ram() (0UL) + +#elif defined(CONFIG_4xx) +#define flush_HPTE(pid, va, pg) _tlbie(va, pid) +extern void MMU_init_hw(void); +extern unsigned long mmu_mapin_ram(void); + +#elif defined(CONFIG_FSL_BOOKE) +#define flush_HPTE(pid, va, pg) _tlbie(va, pid) +extern void MMU_init_hw(void); +extern unsigned long mmu_mapin_ram(void); +extern void adjust_total_lowmem(void); + +#elif defined(CONFIG_PPC32) +/* anything 32-bit except 4xx or 8xx */ +extern void MMU_init_hw(void); +extern unsigned long mmu_mapin_ram(void); + +/* Be careful....this needs to be updated if we ever encounter 603 SMPs, + * which includes all new 82xx processors. We need tlbie/tlbsync here + * in that case (I think). -- Dan. + */ +static inline void flush_HPTE(unsigned context, unsigned long va, + unsigned long pdval) +{ + if ((Hash != 0) && + cpu_has_feature(CPU_FTR_HPTE_TABLE)) + flush_hash_pages(0, va, pdval, 1); + else + _tlbie(va); +} +#endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c new file mode 100644 index 0000000..cf81049 --- /dev/null +++ b/arch/powerpc/mm/numa.c @@ -0,0 +1,1157 @@ +/* + * pSeries NUMA support + * + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/threads.h> +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/lmb.h> +#include <linux/of.h> +#include <asm/sparsemem.h> +#include <asm/prom.h> +#include <asm/system.h> +#include <asm/smp.h> + +static int numa_enabled = 1; + +static char *cmdline __initdata; + +static int numa_debug; +#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } + +int numa_cpu_lookup_table[NR_CPUS]; +cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES]; + +EXPORT_SYMBOL(numa_cpu_lookup_table); +EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(node_data); + +static int min_common_depth; +static int n_mem_addr_cells, n_mem_size_cells; + +static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, + unsigned int *nid) +{ + unsigned long long mem; + char *p = cmdline; + static unsigned int fake_nid; + static unsigned long long curr_boundary; + + /* + * Modify node id, iff we started creating NUMA nodes + * We want to continue from where we left of the last time + */ + if (fake_nid) + *nid = fake_nid; + /* + * In case there are no more arguments to parse, the + * node_id should be the same as the last fake node id + * (we've handled this above). + */ + if (!p) + return 0; + + mem = memparse(p, &p); + if (!mem) + return 0; + + if (mem < curr_boundary) + return 0; + + curr_boundary = mem; + + if ((end_pfn << PAGE_SHIFT) > mem) { + /* + * Skip commas and spaces + */ + while (*p == ',' || *p == ' ' || *p == '\t') + p++; + + cmdline = p; + fake_nid++; + *nid = fake_nid; + dbg("created new fake_node with id %d\n", fake_nid); + return 1; + } + return 0; +} + +/* + * get_active_region_work_fn - A helper function for get_node_active_region + * Returns datax set to the start_pfn and end_pfn if they contain + * the initial value of datax->start_pfn between them + * @start_pfn: start page(inclusive) of region to check + * @end_pfn: end page(exclusive) of region to check + * @datax: comes in with ->start_pfn set to value to search for and + * goes out with active range if it contains it + * Returns 1 if search value is in range else 0 + */ +static int __init get_active_region_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct node_active_region *data; + data = (struct node_active_region *)datax; + + if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) { + data->start_pfn = start_pfn; + data->end_pfn = end_pfn; + return 1; + } + return 0; + +} + +/* + * get_node_active_region - Return active region containing start_pfn + * Active range returned is empty if none found. + * @start_pfn: The page to return the region for. + * @node_ar: Returned set to the active region containing start_pfn + */ +static void __init get_node_active_region(unsigned long start_pfn, + struct node_active_region *node_ar) +{ + int nid = early_pfn_to_nid(start_pfn); + + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + node_ar->end_pfn = start_pfn; + work_with_active_regions(nid, get_active_region_work_fn, node_ar); +} + +static void __cpuinit map_cpu_to_node(int cpu, int node) +{ + numa_cpu_lookup_table[cpu] = node; + + dbg("adding cpu %d to node %d\n", cpu, node); + + if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) + cpu_set(cpu, numa_cpumask_lookup_table[node]); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void unmap_cpu_from_node(unsigned long cpu) +{ + int node = numa_cpu_lookup_table[cpu]; + + dbg("removing cpu %lu from node %d\n", cpu, node); + + if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { + cpu_clear(cpu, numa_cpumask_lookup_table[node]); + } else { + printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", + cpu, node); + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static struct device_node * __cpuinit find_cpu_node(unsigned int cpu) +{ + unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); + struct device_node *cpu_node = NULL; + const unsigned int *interrupt_server, *reg; + int len; + + while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) { + /* Try interrupt server first */ + interrupt_server = of_get_property(cpu_node, + "ibm,ppc-interrupt-server#s", &len); + + len = len / sizeof(u32); + + if (interrupt_server && (len > 0)) { + while (len--) { + if (interrupt_server[len] == hw_cpuid) + return cpu_node; + } + } else { + reg = of_get_property(cpu_node, "reg", &len); + if (reg && (len > 0) && (reg[0] == hw_cpuid)) + return cpu_node; + } + } + + return NULL; +} + +/* must hold reference to node during call */ +static const int *of_get_associativity(struct device_node *dev) +{ + return of_get_property(dev, "ibm,associativity", NULL); +} + +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const u32 *of_get_usable_memory(struct device_node *memory) +{ + const u32 *prop; + u32 len; + prop = of_get_property(memory, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + return prop; +} + +/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa + * info is found. + */ +static int of_node_to_nid_single(struct device_node *device) +{ + int nid = -1; + const unsigned int *tmp; + + if (min_common_depth == -1) + goto out; + + tmp = of_get_associativity(device); + if (!tmp) + goto out; + + if (tmp[0] >= min_common_depth) + nid = tmp[min_common_depth]; + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= MAX_NUMNODES) + nid = -1; +out: + return nid; +} + +/* Walk the device tree upwards, looking for an associativity id */ +int of_node_to_nid(struct device_node *device) +{ + struct device_node *tmp; + int nid = -1; + + of_node_get(device); + while (device) { + nid = of_node_to_nid_single(device); + if (nid != -1) + break; + + tmp = device; + device = of_get_parent(tmp); + of_node_put(tmp); + } + of_node_put(device); + + return nid; +} +EXPORT_SYMBOL_GPL(of_node_to_nid); + +/* + * In theory, the "ibm,associativity" property may contain multiple + * associativity lists because a resource may be multiply connected + * into the machine. This resource then has different associativity + * characteristics relative to its multiple connections. We ignore + * this for now. We also assume that all cpu and memory sets have + * their distances represented at a common level. This won't be + * true for hierarchical NUMA. + * + * In any case the ibm,associativity-reference-points should give + * the correct depth for a normal NUMA system. + * + * - Dave Hansen <haveblue@us.ibm.com> + */ +static int __init find_min_common_depth(void) +{ + int depth; + const unsigned int *ref_points; + struct device_node *rtas_root; + unsigned int len; + + rtas_root = of_find_node_by_path("/rtas"); + + if (!rtas_root) + return -1; + + /* + * this property is 2 32-bit integers, each representing a level of + * depth in the associativity nodes. The first is for an SMP + * configuration (should be all 0's) and the second is for a normal + * NUMA configuration. + */ + ref_points = of_get_property(rtas_root, + "ibm,associativity-reference-points", &len); + + if ((len >= 1) && ref_points) { + depth = ref_points[1]; + } else { + dbg("NUMA: ibm,associativity-reference-points not found.\n"); + depth = -1; + } + of_node_put(rtas_root); + + return depth; +} + +static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) +{ + struct device_node *memory = NULL; + + memory = of_find_node_by_type(memory, "memory"); + if (!memory) + panic("numa.c: No memory nodes found!"); + + *n_addr_cells = of_n_addr_cells(memory); + *n_size_cells = of_n_size_cells(memory); + of_node_put(memory); +} + +static unsigned long __devinit read_n_cells(int n, const unsigned int **buf) +{ + unsigned long result = 0; + + while (n--) { + result = (result << 32) | **buf; + (*buf)++; + } + return result; +} + +struct of_drconf_cell { + u64 base_addr; + u32 drc_index; + u32 reserved; + u32 aa_index; + u32 flags; +}; + +#define DRCONF_MEM_ASSIGNED 0x00000008 +#define DRCONF_MEM_AI_INVALID 0x00000040 +#define DRCONF_MEM_RESERVED 0x00000080 + +/* + * Read the next lmb list entry from the ibm,dynamic-memory property + * and return the information in the provided of_drconf_cell structure. + */ +static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp) +{ + const u32 *cp; + + drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp); + + cp = *cellp; + drmem->drc_index = cp[0]; + drmem->reserved = cp[1]; + drmem->aa_index = cp[2]; + drmem->flags = cp[3]; + + *cellp = cp + 4; +} + +/* + * Retreive and validate the ibm,dynamic-memory property of the device tree. + * + * The layout of the ibm,dynamic-memory property is a number N of lmb + * list entries followed by N lmb list entries. Each lmb list entry + * contains information as layed out in the of_drconf_cell struct above. + */ +static int of_get_drconf_memory(struct device_node *memory, const u32 **dm) +{ + const u32 *prop; + u32 len, entries; + + prop = of_get_property(memory, "ibm,dynamic-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + entries = *prop++; + + /* Now that we know the number of entries, revalidate the size + * of the property read in to ensure we have everything + */ + if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) + return 0; + + *dm = prop; + return entries; +} + +/* + * Retreive and validate the ibm,lmb-size property for drconf memory + * from the device tree. + */ +static u64 of_get_lmb_size(struct device_node *memory) +{ + const u32 *prop; + u32 len; + + prop = of_get_property(memory, "ibm,lmb-size", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + return read_n_cells(n_mem_size_cells, &prop); +} + +struct assoc_arrays { + u32 n_arrays; + u32 array_sz; + const u32 *arrays; +}; + +/* + * Retreive and validate the list of associativity arrays for drconf + * memory from the ibm,associativity-lookup-arrays property of the + * device tree.. + * + * The layout of the ibm,associativity-lookup-arrays property is a number N + * indicating the number of associativity arrays, followed by a number M + * indicating the size of each associativity array, followed by a list + * of N associativity arrays. + */ +static int of_get_assoc_arrays(struct device_node *memory, + struct assoc_arrays *aa) +{ + const u32 *prop; + u32 len; + + prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); + if (!prop || len < 2 * sizeof(unsigned int)) + return -1; + + aa->n_arrays = *prop++; + aa->array_sz = *prop++; + + /* Now that we know the number of arrrays and size of each array, + * revalidate the size of the property read in. + */ + if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) + return -1; + + aa->arrays = prop; + return 0; +} + +/* + * This is like of_node_to_nid_single() for memory represented in the + * ibm,dynamic-reconfiguration-memory node. + */ +static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, + struct assoc_arrays *aa) +{ + int default_nid = 0; + int nid = default_nid; + int index; + + if (min_common_depth > 0 && min_common_depth <= aa->array_sz && + !(drmem->flags & DRCONF_MEM_AI_INVALID) && + drmem->aa_index < aa->n_arrays) { + index = drmem->aa_index * aa->array_sz + min_common_depth - 1; + nid = aa->arrays[index]; + + if (nid == 0xffff || nid >= MAX_NUMNODES) + nid = default_nid; + } + + return nid; +} + +/* + * Figure out to which domain a cpu belongs and stick it there. + * Return the id of the domain used. + */ +static int __cpuinit numa_setup_cpu(unsigned long lcpu) +{ + int nid = 0; + struct device_node *cpu = find_cpu_node(lcpu); + + if (!cpu) { + WARN_ON(1); + goto out; + } + + nid = of_node_to_nid_single(cpu); + + if (nid < 0 || !node_online(nid)) + nid = any_online_node(NODE_MASK_ALL); +out: + map_cpu_to_node(lcpu, nid); + + of_node_put(cpu); + + return nid; +} + +static int __cpuinit cpu_numa_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned long lcpu = (unsigned long)hcpu; + int ret = NOTIFY_DONE; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + numa_setup_cpu(lcpu); + ret = NOTIFY_OK; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + unmap_cpu_from_node(lcpu); + break; + ret = NOTIFY_OK; +#endif + } + return ret; +} + +/* + * Check and possibly modify a memory region to enforce the memory limit. + * + * Returns the size the region should have to enforce the memory limit. + * This will either be the original value of size, a truncated value, + * or zero. If the returned value of size is 0 the region should be + * discarded as it lies wholy above the memory limit. + */ +static unsigned long __init numa_enforce_memory_limit(unsigned long start, + unsigned long size) +{ + /* + * We use lmb_end_of_DRAM() in here instead of memory_limit because + * we've already adjusted it for the limit and it takes care of + * having memory holes below the limit. Also, in the case of + * iommu_is_off, memory_limit is not set but is implicitly enforced. + */ + + if (start + size <= lmb_end_of_DRAM()) + return size; + + if (start >= lmb_end_of_DRAM()) + return 0; + + return lmb_end_of_DRAM() - start; +} + +/* + * Reads the counter for a given entry in + * linux,drconf-usable-memory property + */ +static inline int __init read_usm_ranges(const u32 **usm) +{ + /* + * For each lmb in ibm,dynamic-memory a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter followed by that many (base, size) duple. + * read the counter from linux,drconf-usable-memory + */ + return read_n_cells(n_mem_size_cells, usm); +} + +/* + * Extract NUMA information from the ibm,dynamic-reconfiguration-memory + * node. This assumes n_mem_{addr,size}_cells have been set. + */ +static void __init parse_drconf_memory(struct device_node *memory) +{ + const u32 *dm, *usm; + unsigned int n, rc, ranges, is_kexec_kdump = 0; + unsigned long lmb_size, base, size, sz; + int nid; + struct assoc_arrays aa; + + n = of_get_drconf_memory(memory, &dm); + if (!n) + return; + + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) + return; + + rc = of_get_assoc_arrays(memory, &aa); + if (rc) + return; + + /* check if this is a kexec/kdump kernel */ + usm = of_get_usable_memory(memory); + if (usm != NULL) + is_kexec_kdump = 1; + + for (; n != 0; --n) { + struct of_drconf_cell drmem; + + read_drconf_cell(&drmem, &dm); + + /* skip this block if the reserved bit is set in flags (0x80) + or if the block is not assigned to this partition (0x8) */ + if ((drmem.flags & DRCONF_MEM_RESERVED) + || !(drmem.flags & DRCONF_MEM_ASSIGNED)) + continue; + + base = drmem.base_addr; + size = lmb_size; + ranges = 1; + + if (is_kexec_kdump) { + ranges = read_usm_ranges(&usm); + if (!ranges) /* there are no (base, size) duple */ + continue; + } + do { + if (is_kexec_kdump) { + base = read_n_cells(n_mem_addr_cells, &usm); + size = read_n_cells(n_mem_size_cells, &usm); + } + nid = of_drconf_to_nid_single(&drmem, &aa); + fake_numa_create_new_node( + ((base + size) >> PAGE_SHIFT), + &nid); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + add_active_range(nid, base >> PAGE_SHIFT, + (base >> PAGE_SHIFT) + + (sz >> PAGE_SHIFT)); + } while (--ranges); + } +} + +static int __init parse_numa_properties(void) +{ + struct device_node *cpu = NULL; + struct device_node *memory = NULL; + int default_nid = 0; + unsigned long i; + + if (numa_enabled == 0) { + printk(KERN_WARNING "NUMA disabled by user\n"); + return -1; + } + + min_common_depth = find_min_common_depth(); + + if (min_common_depth < 0) + return min_common_depth; + + dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + + /* + * Even though we connect cpus to numa domains later in SMP + * init, we need to know the node ids now. This is because + * each node to be onlined must have NODE_DATA etc backing it. + */ + for_each_present_cpu(i) { + int nid; + + cpu = find_cpu_node(i); + BUG_ON(!cpu); + nid = of_node_to_nid_single(cpu); + of_node_put(cpu); + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ + if (nid < 0) + continue; + node_set_online(nid); + } + + get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); + memory = NULL; + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long start; + unsigned long size; + int nid; + int ranges; + const unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = of_get_property(memory, + "linux,usable-memory", &len); + if (!memcell_buf || len <= 0) + memcell_buf = of_get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + /* ranges in cell */ + ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); +new_range: + /* these are order-sensitive, and modify the buffer pointer */ + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + + /* + * Assumption: either all memory nodes or none will + * have associativity properties. If none, then + * everything goes to default_nid. + */ + nid = of_node_to_nid_single(memory); + if (nid < 0) + nid = default_nid; + + fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); + node_set_online(nid); + + if (!(size = numa_enforce_memory_limit(start, size))) { + if (--ranges) + goto new_range; + else + continue; + } + + add_active_range(nid, start >> PAGE_SHIFT, + (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT)); + + if (--ranges) + goto new_range; + } + + /* + * Now do the same thing for each LMB listed in the ibm,dynamic-memory + * property in the ibm,dynamic-reconfiguration-memory node. + */ + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) + parse_drconf_memory(memory); + + return 0; +} + +static void __init setup_nonnuma(void) +{ + unsigned long top_of_ram = lmb_end_of_DRAM(); + unsigned long total_ram = lmb_phys_mem_size(); + unsigned long start_pfn, end_pfn; + unsigned int i, nid = 0; + + printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", + top_of_ram, total_ram); + printk(KERN_DEBUG "Memory hole size: %ldMB\n", + (top_of_ram - total_ram) >> 20); + + for (i = 0; i < lmb.memory.cnt; ++i) { + start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; + end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i); + + fake_numa_create_new_node(end_pfn, &nid); + add_active_range(nid, start_pfn, end_pfn); + node_set_online(nid); + } +} + +void __init dump_numa_cpu_topology(void) +{ + unsigned int node; + unsigned int cpu, count; + + if (min_common_depth == -1 || !numa_enabled) + return; + + for_each_online_node(node) { + printk(KERN_DEBUG "Node %d CPUs:", node); + + count = 0; + /* + * If we used a CPU iterator here we would miss printing + * the holes in the cpumap. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { + if (count == 0) + printk(" %u", cpu); + ++count; + } else { + if (count > 1) + printk("-%u", cpu - 1); + count = 0; + } + } + + if (count > 1) + printk("-%u", NR_CPUS - 1); + printk("\n"); + } +} + +static void __init dump_numa_memory_topology(void) +{ + unsigned int node; + unsigned int count; + + if (min_common_depth == -1 || !numa_enabled) + return; + + for_each_online_node(node) { + unsigned long i; + + printk(KERN_DEBUG "Node %d Memory:", node); + + count = 0; + + for (i = 0; i < lmb_end_of_DRAM(); + i += (1 << SECTION_SIZE_BITS)) { + if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { + if (count == 0) + printk(" 0x%lx", i); + ++count; + } else { + if (count > 0) + printk("-0x%lx", i); + count = 0; + } + } + + if (count > 0) + printk("-0x%lx", i); + printk("\n"); + } +} + +/* + * Allocate some memory, satisfying the lmb or bootmem allocator where + * required. nid is the preferred node and end is the physical address of + * the highest address in the node. + * + * Returns the physical address of the memory. + */ +static void __init *careful_allocation(int nid, unsigned long size, + unsigned long align, + unsigned long end_pfn) +{ + int new_nid; + unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT); + + /* retry over all memory */ + if (!ret) + ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM()); + + if (!ret) + panic("numa.c: cannot allocate %lu bytes on node %d", + size, nid); + + /* + * If the memory came from a previously allocated node, we must + * retry with the bootmem allocator. + */ + new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT); + if (new_nid < nid) { + ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid), + size, align, 0); + + if (!ret) + panic("numa.c: cannot allocate %lu bytes on node %d", + size, new_nid); + + ret = __pa(ret); + + dbg("alloc_bootmem %lx %lx\n", ret, size); + } + + return (void *)ret; +} + +static struct notifier_block __cpuinitdata ppc64_numa_nb = { + .notifier_call = cpu_numa_callback, + .priority = 1 /* Must run before sched domains notifier. */ +}; + +static void mark_reserved_regions_for_nid(int nid) +{ + struct pglist_data *node = NODE_DATA(nid); + int i; + + for (i = 0; i < lmb.reserved.cnt; i++) { + unsigned long physbase = lmb.reserved.region[i].base; + unsigned long size = lmb.reserved.region[i].size; + unsigned long start_pfn = physbase >> PAGE_SHIFT; + unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT); + struct node_active_region node_ar; + unsigned long node_end_pfn = node->node_start_pfn + + node->node_spanned_pages; + + /* + * Check to make sure that this lmb.reserved area is + * within the bounds of the node that we care about. + * Checking the nid of the start and end points is not + * sufficient because the reserved area could span the + * entire node. + */ + if (end_pfn <= node->node_start_pfn || + start_pfn >= node_end_pfn) + continue; + + get_node_active_region(start_pfn, &node_ar); + while (start_pfn < end_pfn && + node_ar.start_pfn < node_ar.end_pfn) { + unsigned long reserve_size = size; + /* + * if reserved region extends past active region + * then trim size to active region + */ + if (end_pfn > node_ar.end_pfn) + reserve_size = (node_ar.end_pfn << PAGE_SHIFT) + - (start_pfn << PAGE_SHIFT); + /* + * Only worry about *this* node, others may not + * yet have valid NODE_DATA(). + */ + if (node_ar.nid == nid) { + dbg("reserve_bootmem %lx %lx nid=%d\n", + physbase, reserve_size, node_ar.nid); + reserve_bootmem_node(NODE_DATA(node_ar.nid), + physbase, reserve_size, + BOOTMEM_DEFAULT); + } + /* + * if reserved region is contained in the active region + * then done. + */ + if (end_pfn <= node_ar.end_pfn) + break; + + /* + * reserved region extends past the active region + * get next active region that contains this + * reserved region + */ + start_pfn = node_ar.end_pfn; + physbase = start_pfn << PAGE_SHIFT; + size = size - reserve_size; + get_node_active_region(start_pfn, &node_ar); + } + } +} + + +void __init do_init_bootmem(void) +{ + int nid; + + min_low_pfn = 0; + max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; + max_pfn = max_low_pfn; + + if (parse_numa_properties()) + setup_nonnuma(); + else + dump_numa_memory_topology(); + + register_cpu_notifier(&ppc64_numa_nb); + cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, + (void *)(unsigned long)boot_cpuid); + + for_each_online_node(nid) { + unsigned long start_pfn, end_pfn; + unsigned long bootmem_paddr; + unsigned long bootmap_pages; + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + + /* + * Allocate the node structure node local if possible + * + * Be careful moving this around, as it relies on all + * previous nodes' bootmem to be initialized and have + * all reserved areas marked. + */ + NODE_DATA(nid) = careful_allocation(nid, + sizeof(struct pglist_data), + SMP_CACHE_BYTES, end_pfn); + NODE_DATA(nid) = __va(NODE_DATA(nid)); + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + + dbg("node %d\n", nid); + dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); + + NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; + + if (NODE_DATA(nid)->node_spanned_pages == 0) + continue; + + dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); + dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); + + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmem_paddr = (unsigned long)careful_allocation(nid, + bootmap_pages << PAGE_SHIFT, + PAGE_SIZE, end_pfn); + memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT); + + dbg("bootmap_paddr = %lx\n", bootmem_paddr); + + init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, + start_pfn, end_pfn); + + free_bootmem_with_active_regions(nid, end_pfn); + /* + * Be very careful about moving this around. Future + * calls to careful_allocation() depend on this getting + * done correctly. + */ + mark_reserved_regions_for_nid(nid); + sparse_memory_present_with_active_regions(nid); + } +} + +void __init paging_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT; + free_area_init_nodes(max_zone_pfns); +} + +static int __init early_numa(char *p) +{ + if (!p) + return 0; + + if (strstr(p, "off")) + numa_enabled = 0; + + if (strstr(p, "debug")) + numa_debug = 1; + + p = strstr(p, "fake="); + if (p) + cmdline = p + strlen("fake="); + + return 0; +} +early_param("numa", early_numa); + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Validate the node associated with the memory section we are + * trying to add. + */ +int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size, + unsigned long scn_addr) +{ + nodemask_t nodes; + + if (*nid < 0 || !node_online(*nid)) + *nid = any_online_node(NODE_MASK_ALL); + + if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) { + nodes_setall(nodes); + while (NODE_DATA(*nid)->node_spanned_pages == 0) { + node_clear(*nid, nodes); + *nid = any_online_node(nodes); + } + + return 1; + } + + return 0; +} + +/* + * Find the node associated with a hot added memory section represented + * by the ibm,dynamic-reconfiguration-memory node. + */ +static int hot_add_drconf_scn_to_nid(struct device_node *memory, + unsigned long scn_addr) +{ + const u32 *dm; + unsigned int n, rc; + unsigned long lmb_size; + int default_nid = any_online_node(NODE_MASK_ALL); + int nid; + struct assoc_arrays aa; + + n = of_get_drconf_memory(memory, &dm); + if (!n) + return default_nid;; + + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) + return default_nid; + + rc = of_get_assoc_arrays(memory, &aa); + if (rc) + return default_nid; + + for (; n != 0; --n) { + struct of_drconf_cell drmem; + + read_drconf_cell(&drmem, &dm); + + /* skip this block if it is reserved or not assigned to + * this partition */ + if ((drmem.flags & DRCONF_MEM_RESERVED) + || !(drmem.flags & DRCONF_MEM_ASSIGNED)) + continue; + + nid = of_drconf_to_nid_single(&drmem, &aa); + + if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size, + scn_addr)) + return nid; + } + + BUG(); /* section address should be found above */ + return 0; +} + +/* + * Find the node associated with a hot added memory section. Section + * corresponds to a SPARSEMEM section, not an LMB. It is assumed that + * sections are fully contained within a single LMB. + */ +int hot_add_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory = NULL; + int nid; + + if (!numa_enabled || (min_common_depth < 0)) + return any_online_node(NODE_MASK_ALL); + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + nid = hot_add_drconf_scn_to_nid(memory, scn_addr); + of_node_put(memory); + return nid; + } + + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long start, size; + int ranges; + const unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = of_get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + /* ranges in cell */ + ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); +ha_new_range: + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + nid = of_node_to_nid_single(memory); + + if (valid_hot_add_scn(&nid, start, size, scn_addr)) { + of_node_put(memory); + return nid; + } + + if (--ranges) /* process all ranges in cell */ + goto ha_new_range; + } + BUG(); /* section address should be found above */ + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c new file mode 100644 index 0000000..2e664f3 --- /dev/null +++ b/arch/powerpc/mm/pgtable_32.c @@ -0,0 +1,422 @@ +/* + * This file contains the routines setting up the linux page tables. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/highmem.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/io.h> + +#include "mmu_decl.h" + +unsigned long ioremap_base; +unsigned long ioremap_bot; +EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ + +#if defined(CONFIG_6xx) || defined(CONFIG_POWER3) +#define HAVE_BATS 1 +#endif + +#if defined(CONFIG_FSL_BOOKE) +#define HAVE_TLBCAM 1 +#endif + +extern char etext[], _stext[]; + +#ifdef CONFIG_SMP +extern void hash_page_sync(void); +#endif + +#ifdef HAVE_BATS +extern phys_addr_t v_mapped_by_bats(unsigned long va); +extern unsigned long p_mapped_by_bats(phys_addr_t pa); +void setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, int flags); + +#else /* !HAVE_BATS */ +#define v_mapped_by_bats(x) (0UL) +#define p_mapped_by_bats(x) (0UL) +#endif /* HAVE_BATS */ + +#ifdef HAVE_TLBCAM +extern unsigned int tlbcam_index; +extern phys_addr_t v_mapped_by_tlbcam(unsigned long va); +extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa); +#else /* !HAVE_TLBCAM */ +#define v_mapped_by_tlbcam(x) (0UL) +#define p_mapped_by_tlbcam(x) (0UL) +#endif /* HAVE_TLBCAM */ + +#ifdef CONFIG_PTE_64BIT +/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */ +#define PGDIR_ORDER 1 +#else +#define PGDIR_ORDER 0 +#endif + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *ret; + + ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); + return ret; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + free_pages((unsigned long)pgd, PGDIR_ORDER); +} + +__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte; + extern int mem_init_done; + extern void *early_get_page(void); + + if (mem_init_done) { + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + } else { + pte = (pte_t *)early_get_page(); + if (pte) + clear_page(pte); + } + return pte; +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *ptepage; + +#ifdef CONFIG_HIGHPTE + gfp_t flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT | __GFP_ZERO; +#else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; +#endif + + ptepage = alloc_pages(flags, 0); + if (!ptepage) + return NULL; + pgtable_page_ctor(ptepage); + return ptepage; +} + +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ +#ifdef CONFIG_SMP + hash_page_sync(); +#endif + free_page((unsigned long)pte); +} + +void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ +#ifdef CONFIG_SMP + hash_page_sync(); +#endif + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +void __iomem * +ioremap(phys_addr_t addr, unsigned long size) +{ + return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED); +} +EXPORT_SYMBOL(ioremap); + +void __iomem * +ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + /* writeable implies dirty for kernel addresses */ + if (flags & _PAGE_RW) + flags |= _PAGE_DIRTY | _PAGE_HWWRITE; + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC); + + return __ioremap(addr, size, flags); +} +EXPORT_SYMBOL(ioremap_flags); + +void __iomem * +__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + unsigned long v, i; + phys_addr_t p; + int err; + + /* Make sure we have the base flags */ + if ((flags & _PAGE_PRESENT) == 0) + flags |= _PAGE_KERNEL; + + /* Non-cacheable page cannot be coherent */ + if (flags & _PAGE_NO_CACHE) + flags &= ~_PAGE_COHERENT; + + /* + * Choose an address to map it to. + * Once the vmalloc system is running, we use it. + * Before then, we use space going down from ioremap_base + * (ioremap_bot records where we're up to). + */ + p = addr & PAGE_MASK; + size = PAGE_ALIGN(addr + size) - p; + + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (p < 16*1024*1024) + p += _ISA_MEM_BASE; + + /* + * Don't allow anybody to remap normal RAM that we're using. + * mem_init() sets high_memory so only do the check after that. + */ + if (mem_init_done && (p < virt_to_phys(high_memory))) { + printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n", + (unsigned long long)p, __builtin_return_address(0)); + return NULL; + } + + if (size == 0) + return NULL; + + /* + * Is it already mapped? Perhaps overlapped by a previous + * BAT mapping. If the whole area is mapped then we're done, + * otherwise remap it since we want to keep the virt addrs for + * each request contiguous. + * + * We make the assumption here that if the bottom and top + * of the range we want are mapped then it's mapped to the + * same virt address (and this is contiguous). + * -- Cort + */ + if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ ) + goto out; + + if ((v = p_mapped_by_tlbcam(p))) + goto out; + + if (mem_init_done) { + struct vm_struct *area; + area = get_vm_area(size, VM_IOREMAP); + if (area == 0) + return NULL; + v = (unsigned long) area->addr; + } else { + v = (ioremap_bot -= size); + } + + /* + * Should check if it is a candidate for a BAT mapping + */ + + err = 0; + for (i = 0; i < size && err == 0; i += PAGE_SIZE) + err = map_page(v+i, p+i, flags); + if (err) { + if (mem_init_done) + vunmap((void *)v); + return NULL; + } + +out: + return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); +} +EXPORT_SYMBOL(__ioremap); + +void iounmap(volatile void __iomem *addr) +{ + /* + * If mapped by BATs then there is nothing to do. + * Calling vfree() generates a benign warning. + */ + if (v_mapped_by_bats((unsigned long)addr)) return; + + if (addr > high_memory && (unsigned long) addr < ioremap_bot) + vunmap((void *) (PAGE_MASK & (unsigned long)addr)); +} +EXPORT_SYMBOL(iounmap); + +int map_page(unsigned long va, phys_addr_t pa, int flags) +{ + pmd_t *pd; + pte_t *pg; + int err = -ENOMEM; + + /* Use upper 10 bits of VA to index the first level map */ + pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); + /* Use middle 10 bits of VA to index the second-level map */ + pg = pte_alloc_kernel(pd, va); + if (pg != 0) { + err = 0; + /* The PTE should never be already set nor present in the + * hash table + */ + BUG_ON(pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)); + set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } + return err; +} + +/* + * Map in a big chunk of physical memory starting at KERNELBASE. + */ +void __init mapin_ram(void) +{ + unsigned long v, s, f; + phys_addr_t p; + int ktext; + + s = mmu_mapin_ram(); + v = KERNELBASE + s; + p = memstart_addr + s; + for (; s < total_lowmem; s += PAGE_SIZE) { + ktext = ((char *) v >= _stext && (char *) v < etext); + f = ktext ?_PAGE_RAM_TEXT : _PAGE_RAM; + map_page(v, p, f); +#ifdef CONFIG_PPC_STD_MMU_32 + if (ktext) + hash_preload(&init_mm, v, 0, 0x300); +#endif + v += PAGE_SIZE; + p += PAGE_SIZE; + } +} + +/* Scan the real Linux page tables and return a PTE pointer for + * a virtual address in a context. + * Returns true (1) if PTE was found, zero otherwise. The pointer to + * the PTE pointer is unmodified if PTE is not found. + */ +int +get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int retval = 0; + + pgd = pgd_offset(mm, addr & PAGE_MASK); + if (pgd) { + pud = pud_offset(pgd, addr & PAGE_MASK); + if (pud && pud_present(*pud)) { + pmd = pmd_offset(pud, addr & PAGE_MASK); + if (pmd_present(*pmd)) { + pte = pte_offset_map(pmd, addr & PAGE_MASK); + if (pte) { + retval = 1; + *ptep = pte; + if (pmdp) + *pmdp = pmd; + /* XXX caller needs to do pte_unmap, yuck */ + } + } + } + } + return(retval); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __change_page_attr(struct page *page, pgprot_t prot) +{ + pte_t *kpte; + pmd_t *kpmd; + unsigned long address; + + BUG_ON(PageHighMem(page)); + address = (unsigned long)page_address(page); + + if (v_mapped_by_bats(address) || v_mapped_by_tlbcam(address)) + return 0; + if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) + return -EINVAL; + set_pte_at(&init_mm, address, kpte, mk_pte(page, prot)); + wmb(); + flush_HPTE(0, address, pmd_val(*kpmd)); + pte_unmap(kpte); + + return 0; +} + +/* + * Change the page attributes of an page in the linear mapping. + * + * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY + */ +static int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + int i, err = 0; + unsigned long flags; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + err = __change_page_attr(page, prot); + if (err) + break; + } + local_irq_restore(flags); + return err; +} + + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +static int fixmaps; +unsigned long FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(FIXADDR_TOP); + +void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + + map_page(address, phys, pgprot_val(flags)); + fixmaps++; +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c new file mode 100644 index 0000000..365e61a --- /dev/null +++ b/arch/powerpc/mm/pgtable_64.c @@ -0,0 +1,250 @@ +/* + * This file contains ioremap and related functions for 64-bit machines. + * + * Derived from arch/ppc64/mm/init.c + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> + +#include <asm/pgalloc.h> +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/tlb.h> +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/system.h> +#include <asm/abs_addr.h> +#include <asm/firmware.h> + +#include "mmu_decl.h" + +unsigned long ioremap_bot = IOREMAP_BASE; + +/* + * map_io_page currently only called by __ioremap + * map_io_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +static int map_io_page(unsigned long ea, unsigned long pa, int flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (mem_init_done) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + return 0; +} + + +/** + * __ioremap_at - Low level function to establish the page tables + * for an IO mapping + */ +void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, + unsigned long flags) +{ + unsigned long i; + + /* Make sure we have the base flags */ + if ((flags & _PAGE_PRESENT) == 0) + flags |= pgprot_val(PAGE_KERNEL); + + /* Non-cacheable page cannot be coherent */ + if (flags & _PAGE_NO_CACHE) + flags &= ~_PAGE_COHERENT; + + /* We don't support the 4K PFN hack with ioremap */ + if (flags & _PAGE_4K_PFN) + return NULL; + + WARN_ON(pa & ~PAGE_MASK); + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + for (i = 0; i < size; i += PAGE_SIZE) + if (map_io_page((unsigned long)ea+i, pa+i, flags)) + return NULL; + + return (void __iomem *)ea; +} + +/** + * __iounmap_from - Low level function to tear down the page tables + * for an IO mapping. This is used for mappings that + * are manipulated manually, like partial unmapping of + * PCI IOs or ISA space. + */ +void __iounmap_at(void *ea, unsigned long size) +{ + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + unmap_kernel_range((unsigned long)ea, size); +} + +void __iomem * __ioremap(phys_addr_t addr, unsigned long size, + unsigned long flags) +{ + phys_addr_t paligned; + void __iomem *ret; + + /* + * Choose an address to map it to. + * Once the imalloc system is running, we use it. + * Before that, we map using addresses going + * up from ioremap_bot. imalloc will use + * the addresses from ioremap_bot through + * IMALLOC_END + * + */ + paligned = addr & PAGE_MASK; + size = PAGE_ALIGN(addr + size) - paligned; + + if ((size == 0) || (paligned == 0)) + return NULL; + + if (mem_init_done) { + struct vm_struct *area; + + area = __get_vm_area(size, VM_IOREMAP, + ioremap_bot, IOREMAP_END); + if (area == NULL) + return NULL; + ret = __ioremap_at(paligned, area->addr, size, flags); + if (!ret) + vunmap(area->addr); + } else { + ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); + if (ret) + ioremap_bot += size; + } + + if (ret) + ret += addr & ~PAGE_MASK; + return ret; +} + + +void __iomem * ioremap(phys_addr_t addr, unsigned long size) +{ + unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags); + return __ioremap(addr, size, flags); +} + +void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size, + unsigned long flags) +{ + /* writeable implies dirty for kernel addresses */ + if (flags & _PAGE_RW) + flags |= _PAGE_DIRTY; + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + flags &= ~(_PAGE_USER | _PAGE_EXEC); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags); + return __ioremap(addr, size, flags); +} + + +/* + * Unmap an IO region and remove it from imalloc'd list. + * Access to IO memory should be serialized by driver. + */ +void __iounmap(volatile void __iomem *token) +{ + void *addr; + + if (!mem_init_done) + return; + + addr = (void *) ((unsigned long __force) + PCI_FIX_ADDR(token) & PAGE_MASK); + if ((unsigned long)addr < ioremap_bot) { + printk(KERN_WARNING "Attempt to iounmap early bolted mapping" + " at 0x%p\n", addr); + return; + } + vunmap(addr); +} + +void iounmap(volatile void __iomem *token) +{ + if (ppc_md.iounmap) + ppc_md.iounmap(token); + else + __iounmap(token); +} + +EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_flags); +EXPORT_SYMBOL(__ioremap); +EXPORT_SYMBOL(__ioremap_at); +EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(__iounmap); +EXPORT_SYMBOL(__iounmap_at); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c new file mode 100644 index 0000000..6aa1208 --- /dev/null +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -0,0 +1,280 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * 8260, and POWER3 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/lmb.h> + +#include <asm/prom.h> +#include <asm/mmu.h> +#include <asm/machdep.h> + +#include "mmu_decl.h" + +struct hash_pte *Hash, *Hash_end; +unsigned long Hash_size, Hash_mask; +unsigned long _SDR1; + +struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ + +struct batrange { /* stores address ranges mapped by BATs */ + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} bat_addrs[8]; + +/* + * Return PA for this VA if it is mapped by a BAT, or 0 + */ +phys_addr_t v_mapped_by_bats(unsigned long va) +{ + int b; + for (b = 0; b < 4; ++b) + if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) + return bat_addrs[b].phys + (va - bat_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_mapped_by_bats(phys_addr_t pa) +{ + int b; + for (b = 0; b < 4; ++b) + if (pa >= bat_addrs[b].phys + && pa < (bat_addrs[b].limit-bat_addrs[b].start) + +bat_addrs[b].phys) + return bat_addrs[b].start+(pa-bat_addrs[b].phys); + return 0; +} + +unsigned long __init mmu_mapin_ram(void) +{ +#ifdef CONFIG_POWER4 + return 0; +#else + unsigned long tot, bl, done; + unsigned long max_size = (256<<20); + + if (__map_without_bats) { + printk(KERN_DEBUG "RAM mapped without BATs\n"); + return 0; + } + + /* Set up BAT2 and if necessary BAT3 to cover RAM. */ + + /* Make sure we don't map a block larger than the + smallest alignment of the physical address. */ + tot = total_lowmem; + for (bl = 128<<10; bl < max_size; bl <<= 1) { + if (bl * 2 > tot) + break; + } + + setbat(2, KERNELBASE, 0, bl, _PAGE_RAM); + done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1; + if ((done < tot) && !bat_addrs[3].limit) { + /* use BAT3 to cover a bit more */ + tot -= done; + for (bl = 128<<10; bl < max_size; bl <<= 1) + if (bl * 2 > tot) + break; + setbat(3, KERNELBASE+done, done, bl, _PAGE_RAM); + done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1; + } + + return done; +#endif +} + +/* + * Set up one of the I/D BAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + */ +void __init setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, int flags) +{ + unsigned int bl; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + + if (((flags & _PAGE_NO_CACHE) == 0) && + cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags |= _PAGE_COHERENT; + + bl = (size >> 17) - 1; + if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + /* 603, 604, etc. */ + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; +#ifndef CONFIG_KGDB /* want user access for breakpoints */ + if (flags & _PAGE_USER) +#endif + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + bat[0].batu = bat[0].batl = 0; + } else { + /* make IBAT same as DBAT */ + bat[0] = bat[1]; + } + } else { + /* 601 cpu */ + if (bl > BL_8M) + bl = BL_8M; + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT); + wimgxpp |= (flags & _PAGE_RW)? + ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; + bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ + bat->batl = phys | bl | 0x40; /* V=1 */ + } + + bat_addrs[index].start = virt; + bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; + bat_addrs[index].phys = phys; +} + +/* + * Preload a translation in the hash table + */ +void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap) +{ + pmd_t *pmd; + + if (Hash == 0) + return; + pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); + if (!pmd_none(*pmd)) + add_hash_page(mm->context.id, ea, pmd_val(*pmd)); +} + +/* + * Initialize the hash table and patch the instructions in hashtable.S. + */ +void __init MMU_init_hw(void) +{ + unsigned int hmask, mb, mb2; + unsigned int n_hpteg, lg_n_hpteg; + + extern unsigned int hash_page_patch_A[]; + extern unsigned int hash_page_patch_B[], hash_page_patch_C[]; + extern unsigned int hash_page[]; + extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[]; + + if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) { + /* + * Put a blr (procedure return) instruction at the + * start of hash_page, since we can still get DSI + * exceptions on a 603. + */ + hash_page[0] = 0x4e800020; + flush_icache_range((unsigned long) &hash_page[0], + (unsigned long) &hash_page[1]); + return; + } + + if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); + +#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ +#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) +#define MIN_N_HPTEG 1024 /* min 64kB hash table */ + + /* + * Allow 1 HPTE (1/8 HPTEG) for each page of memory. + * This is less than the recommended amount, but then + * Linux ain't AIX. + */ + n_hpteg = total_memory / (PAGE_SIZE * 8); + if (n_hpteg < MIN_N_HPTEG) + n_hpteg = MIN_N_HPTEG; + lg_n_hpteg = __ilog2(n_hpteg); + if (n_hpteg & (n_hpteg - 1)) { + ++lg_n_hpteg; /* round up if not power of 2 */ + n_hpteg = 1 << lg_n_hpteg; + } + Hash_size = n_hpteg << LG_HPTEG_SIZE; + + /* + * Find some memory for the hash table. + */ + if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); + Hash = __va(lmb_alloc_base(Hash_size, Hash_size, + __initial_memory_limit_addr)); + cacheable_memzero(Hash, Hash_size); + _SDR1 = __pa(Hash) | SDR1_LOW_BITS; + + Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); + + printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); + + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ + if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); + Hash_mask = n_hpteg - 1; + hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + if (lg_n_hpteg > 16) + mb2 = 16 - LG_HPTEG_SIZE; + + hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff) + | ((unsigned int)(Hash) >> 16); + hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6); + hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6); + hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask; + hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask; + + /* + * Ensure that the locations we've patched have been written + * out from the data cache and invalidated in the instruction + * cache, on those machines with split caches. + */ + flush_icache_range((unsigned long) &hash_page_patch_A[0], + (unsigned long) &hash_page_patch_C[1]); + + /* + * Patch up the instructions in hashtable.S:flush_hash_page + */ + flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff) + | ((unsigned int)(Hash) >> 16); + flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6); + flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6); + flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask; + flush_icache_range((unsigned long) &flush_hash_patch_A[0], + (unsigned long) &flush_hash_patch_B[1]); + + if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); +} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c new file mode 100644 index 0000000..89497fb --- /dev/null +++ b/arch/powerpc/mm/slb.c @@ -0,0 +1,328 @@ +/* + * PowerPC64 SLB support. + * + * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM + * Based on earlier code writteh by: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/paca.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/smp.h> +#include <asm/firmware.h> +#include <linux/compiler.h> +#include <asm/udbg.h> + +#ifdef DEBUG +#define DBG(fmt...) printk(fmt) +#else +#define DBG pr_debug +#endif + +extern void slb_allocate_realmode(unsigned long ea); +extern void slb_allocate_user(unsigned long ea); + +static void slb_allocate(unsigned long ea) +{ + /* Currently, we do real mode for all SLBs including user, but + * that will change if we bring back dynamic VSIDs + */ + slb_allocate_realmode(ea); +} + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, + unsigned long slot) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; +} + +#define slb_vsid_shift(ssize) \ + ((ssize) == MMU_SEGSIZE_256M? SLB_VSID_SHIFT: SLB_VSID_SHIFT_1T) + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline void slb_shadow_update(unsigned long ea, int ssize, + unsigned long flags, + unsigned long entry) +{ + /* + * Clear the ESID first so the entry is not valid while we are + * updating it. No write barriers are needed here, provided + * we only update the current CPU's SLB shadow buffer. + */ + get_slb_shadow()->save_area[entry].esid = 0; + get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags); + get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry); +} + +static inline void slb_shadow_clear(unsigned long entry) +{ + get_slb_shadow()->save_area[entry].esid = 0; +} + +static inline void create_shadowed_slbe(unsigned long ea, int ssize, + unsigned long flags, + unsigned long entry) +{ + /* + * Updating the shadow buffer before writing the SLB ensures + * we don't get a stale entry here if we get preempted by PHYP + * between these two statements. + */ + slb_shadow_update(ea, ssize, flags, entry); + + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, ssize, flags)), + "r" (mk_esid_data(ea, ssize, entry)) + : "memory" ); +} + +void slb_flush_and_rebolt(void) +{ + /* If you change this make sure you change SLB_NUM_BOLTED + * appropriately too. */ + unsigned long linear_llp, vmalloc_llp, lflags, vflags; + unsigned long ksp_esid_data, ksp_vsid_data; + + WARN_ON(!irqs_disabled()); + + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; + lflags = SLB_VSID_KERNEL | linear_llp; + vflags = SLB_VSID_KERNEL | vmalloc_llp; + + ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2); + if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { + ksp_esid_data &= ~SLB_ESID_V; + ksp_vsid_data = 0; + slb_shadow_clear(2); + } else { + /* Update stack entry; others don't change */ + slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); + ksp_vsid_data = get_slb_shadow()->save_area[2].vsid; + } + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + /* We need to do this all in asm, so we're sure we don't touch + * the stack between the slbia and rebolting it. */ + asm volatile("isync\n" + "slbia\n" + /* Slot 1 - first VMALLOC segment */ + "slbmte %0,%1\n" + /* Slot 2 - kernel stack */ + "slbmte %2,%3\n" + "isync" + :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), + "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)), + "r"(ksp_vsid_data), + "r"(ksp_esid_data) + : "memory"); +} + +void slb_vmalloc_update(void) +{ + unsigned long vflags; + + vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + slb_flush_and_rebolt(); +} + +/* Helper function to compare esids. There are four cases to handle. + * 1. The system is not 1T segment size capable. Use the GET_ESID compare. + * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. + * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. + * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. + */ +static inline int esids_match(unsigned long addr1, unsigned long addr2) +{ + int esid_1t_count; + + /* System is not 1T segment size capable. */ + if (!cpu_has_feature(CPU_FTR_1T_SEGMENT)) + return (GET_ESID(addr1) == GET_ESID(addr2)); + + esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + + ((addr2 >> SID_SHIFT_1T) != 0)); + + /* both addresses are < 1T */ + if (esid_1t_count == 0) + return (GET_ESID(addr1) == GET_ESID(addr2)); + + /* One address < 1T, the other > 1T. Not a match */ + if (esid_1t_count == 1) + return 0; + + /* Both addresses are > 1T. */ + return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); +} + +/* Flush all user entries from the segment table of the current processor. */ +void switch_slb(struct task_struct *tsk, struct mm_struct *mm) +{ + unsigned long offset = get_paca()->slb_cache_ptr; + unsigned long slbie_data = 0; + unsigned long pc = KSTK_EIP(tsk); + unsigned long stack = KSTK_ESP(tsk); + unsigned long unmapped_base; + + if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + int i; + asm volatile("isync" : : : "memory"); + for (i = 0; i < offset; i++) { + slbie_data = (unsigned long)get_paca()->slb_cache[i] + << SID_SHIFT; /* EA */ + slbie_data |= user_segment_size(slbie_data) + << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* C set for user addresses */ + asm volatile("slbie %0" : : "r" (slbie_data)); + } + asm volatile("isync" : : : "memory"); + } else { + slb_flush_and_rebolt(); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (offset == 1 || offset > SLB_CACHE_ENTRIES) + asm volatile("slbie %0" : : "r" (slbie_data)); + + get_paca()->slb_cache_ptr = 0; + get_paca()->context = mm->context; + + /* + * preload some userspace segments into the SLB. + */ + if (test_tsk_thread_flag(tsk, TIF_32BIT)) + unmapped_base = TASK_UNMAPPED_BASE_USER32; + else + unmapped_base = TASK_UNMAPPED_BASE_USER64; + + if (is_kernel_addr(pc)) + return; + slb_allocate(pc); + + if (esids_match(pc,stack)) + return; + + if (is_kernel_addr(stack)) + return; + slb_allocate(stack); + + if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base)) + return; + + if (is_kernel_addr(unmapped_base)) + return; + slb_allocate(unmapped_base); +} + +static inline void patch_slb_encoding(unsigned int *insn_addr, + unsigned int immed) +{ + /* Assume the instruction had a "0" immediate value, just + * "or" in the new value + */ + *insn_addr |= immed; + flush_icache_range((unsigned long)insn_addr, 4+ + (unsigned long)insn_addr); +} + +void slb_initialize(void) +{ + unsigned long linear_llp, vmalloc_llp, io_llp; + unsigned long lflags, vflags; + static int slb_encoding_inited; + extern unsigned int *slb_miss_kernel_load_linear; + extern unsigned int *slb_miss_kernel_load_io; + extern unsigned int *slb_compare_rr_to_size; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + extern unsigned int *slb_miss_kernel_load_vmemmap; + unsigned long vmemmap_llp; +#endif + + /* Prepare our SLB miss handler based on our page size */ + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + io_llp = mmu_psize_defs[mmu_io_psize].sllp; + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; + get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + if (!slb_encoding_inited) { + slb_encoding_inited = 1; + patch_slb_encoding(slb_miss_kernel_load_linear, + SLB_VSID_KERNEL | linear_llp); + patch_slb_encoding(slb_miss_kernel_load_io, + SLB_VSID_KERNEL | io_llp); + patch_slb_encoding(slb_compare_rr_to_size, + mmu_slb_size); + + DBG("SLB: linear LLP = %04lx\n", linear_llp); + DBG("SLB: io LLP = %04lx\n", io_llp); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + patch_slb_encoding(slb_miss_kernel_load_vmemmap, + SLB_VSID_KERNEL | vmemmap_llp); + DBG("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); +#endif + } + + get_paca()->stab_rr = SLB_NUM_BOLTED; + + /* On iSeries the bolted entries have already been set up by + * the hypervisor from the lparMap data in head.S */ + if (firmware_has_feature(FW_FEATURE_ISERIES)) + return; + + lflags = SLB_VSID_KERNEL | linear_llp; + vflags = SLB_VSID_KERNEL | vmalloc_llp; + + /* Invalidate the entire SLB (even slot 0) & all the ERATS */ + asm volatile("isync":::"memory"); + asm volatile("slbmte %0,%0"::"r" (0) : "memory"); + asm volatile("isync; slbia; isync":::"memory"); + create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0); + + create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + + /* For the boot cpu, we're running on the stack in init_thread_union, + * which is in the first segment of the linear mapping, and also + * get_paca()->kstack hasn't been initialized yet. + * For secondary cpus, we need to bolt the kernel stack entry now. + */ + slb_shadow_clear(2); + if (raw_smp_processor_id() != boot_cpuid && + (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) + create_shadowed_slbe(get_paca()->kstack, + mmu_kernel_ssize, lflags, 2); + + asm volatile("isync":::"memory"); +} diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S new file mode 100644 index 0000000..bc44dc4 --- /dev/null +++ b/arch/powerpc/mm/slb_low.S @@ -0,0 +1,301 @@ +/* + * Low-level SLB routines + * + * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM + * + * Based on earlier C version: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/firmware.h> + +/* void slb_allocate_realmode(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * No other registers are examined or changed. + */ +_GLOBAL(slb_allocate_realmode) + /* r3 = faulting address */ + + srdi r9,r3,60 /* get region */ + srdi r10,r3,28 /* get esid */ + cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ + + /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ + blt cr7,0f /* user or kernel? */ + + /* kernel address: proto-VSID = ESID */ + /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but + * this code will generate the protoVSID 0xfffffffff for the + * top segment. That's ok, the scramble below will translate + * it to VSID 0, which is reserved as a bad VSID - one which + * will never have any pages in it. */ + + /* Check if hitting the linear mapping or some other kernel space + */ + bne cr7,1f + + /* Linear mapping encoding bits, the "li" instruction below will + * be patched by the kernel at boot + */ +_GLOBAL(slb_miss_kernel_load_linear) + li r11,0 +BEGIN_FTR_SECTION + b slb_finish_load +END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT) + b slb_finish_load_1T + +1: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* Check virtual memmap region. To be patches at kernel boot */ + cmpldi cr0,r9,0xf + bne 1f +_GLOBAL(slb_miss_kernel_load_vmemmap) + li r11,0 + b 6f +1: +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + /* vmalloc/ioremap mapping encoding bits, the "li" instructions below + * will be patched by the kernel at boot + */ +BEGIN_FTR_SECTION + /* check whether this is in vmalloc or ioremap space */ + clrldi r11,r10,48 + cmpldi r11,(VMALLOC_SIZE >> 28) - 1 + bgt 5f + lhz r11,PACAVMALLOCSLLP(r13) + b 6f +5: +END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE) +_GLOBAL(slb_miss_kernel_load_io) + li r11,0 +6: +BEGIN_FTR_SECTION + b slb_finish_load +END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT) + b slb_finish_load_1T + +0: /* user address: proto-VSID = context << 15 | ESID. First check + * if the address is within the boundaries of the user region + */ + srdi. r9,r10,USER_ESID_BITS + bne- 8f /* invalid ea bits set */ + + + /* when using slices, we extract the psize off the slice bitmaps + * and then we need to get the sllp encoding off the mmu_psize_defs + * array. + * + * XXX This is a bit inefficient especially for the normal case, + * so we should try to implement a fast path for the standard page + * size using the old sllp value so we avoid the array. We cannot + * really do dynamic patching unfortunately as processes might flip + * between 4k and 64k standard page size + */ +#ifdef CONFIG_PPC_MM_SLICES + cmpldi r10,16 + + /* Get the slice index * 4 in r11 and matching slice size mask in r9 */ + ld r9,PACALOWSLICESPSIZE(r13) + sldi r11,r10,2 + blt 5f + ld r9,PACAHIGHSLICEPSIZE(r13) + srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2) + andi. r11,r11,0x3c + +5: /* Extract the psize and multiply to get an array offset */ + srd r9,r9,r11 + andi. r9,r9,0xf + mulli r9,r9,MMUPSIZEDEFSIZE + + /* Now get to the array and obtain the sllp + */ + ld r11,PACATOC(r13) + ld r11,mmu_psize_defs@got(r11) + add r11,r11,r9 + ld r11,MMUPSIZESLLP(r11) + ori r11,r11,SLB_VSID_USER +#else + /* paca context sllp already contains the SLB_VSID_USER bits */ + lhz r11,PACACONTEXTSLLP(r13) +#endif /* CONFIG_PPC_MM_SLICES */ + + ld r9,PACACONTEXTID(r13) +BEGIN_FTR_SECTION + cmpldi r10,0x1000 +END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) + rldimi r10,r9,USER_ESID_BITS,0 +BEGIN_FTR_SECTION + bge slb_finish_load_1T +END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) + b slb_finish_load + +8: /* invalid EA */ + li r10,0 /* BAD_VSID */ + li r11,SLB_VSID_USER /* flags don't much matter */ + b slb_finish_load + +#ifdef __DISABLED__ + +/* void slb_allocate_user(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * No other registers are examined or changed. + * + * It is called with translation enabled in order to be able to walk the + * page tables. This is not currently used. + */ +_GLOBAL(slb_allocate_user) + /* r3 = faulting address */ + srdi r10,r3,28 /* get esid */ + + crset 4*cr7+lt /* set "user" flag for later */ + + /* check if we fit in the range covered by the pagetables*/ + srdi. r9,r3,PGTABLE_EADDR_SIZE + crnot 4*cr0+eq,4*cr0+eq + beqlr + + /* now we need to get to the page tables in order to get the page + * size encoding from the PMD. In the future, we'll be able to deal + * with 1T segments too by getting the encoding from the PGD instead + */ + ld r9,PACAPGDIR(r13) + cmpldi cr0,r9,0 + beqlr + rlwinm r11,r10,8,25,28 + ldx r9,r9,r11 /* get pgd_t */ + cmpldi cr0,r9,0 + beqlr + rlwinm r11,r10,3,17,28 + ldx r9,r9,r11 /* get pmd_t */ + cmpldi cr0,r9,0 + beqlr + + /* build vsid flags */ + andi. r11,r9,SLB_VSID_LLP + ori r11,r11,SLB_VSID_USER + + /* get context to calculate proto-VSID */ + ld r9,PACACONTEXTID(r13) + rldimi r10,r9,USER_ESID_BITS,0 + + /* fall through slb_finish_load */ + +#endif /* __DISABLED__ */ + + +/* + * Finish loading of an SLB entry and return + * + * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET + */ +slb_finish_load: + ASM_VSID_SCRAMBLE(r10,r9,256M) + rldimi r11,r10,SLB_VSID_SHIFT,16 /* combine VSID and flags */ + + /* r3 = EA, r11 = VSID data */ + /* + * Find a slot, round robin. Previously we tried to find a + * free slot first but that took too long. Unfortunately we + * dont have any LRU information to help us choose a slot. + */ +#ifdef CONFIG_PPC_ISERIES +BEGIN_FW_FTR_SECTION + /* + * On iSeries, the "bolted" stack segment can be cast out on + * shared processor switch so we need to check for a miss on + * it and restore it to the right slot. + */ + ld r9,PACAKSAVE(r13) + clrrdi r9,r9,28 + clrrdi r3,r3,28 + li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ + cmpld r9,r3 + beq 3f +END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) +#endif /* CONFIG_PPC_ISERIES */ + +7: ld r10,PACASTABRR(r13) + addi r10,r10,1 + /* This gets soft patched on boot. */ +_GLOBAL(slb_compare_rr_to_size) + cmpldi r10,0 + + blt+ 4f + li r10,SLB_NUM_BOLTED + +4: + std r10,PACASTABRR(r13) + +3: + rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */ + oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */ + + /* r3 = ESID data, r11 = VSID data */ + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + */ + slbmte r11,r10 + + /* we're done for kernel addresses */ + crclr 4*cr0+eq /* set result to "success" */ + bgelr cr7 + + /* Update the slb cache */ + lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ + cmpldi r3,SLB_CACHE_ENTRIES + bge 1f + + /* still room in the slb cache */ + sldi r11,r3,1 /* r11 = offset * sizeof(u16) */ + rldicl r10,r10,36,28 /* get low 16 bits of the ESID */ + add r11,r11,r13 /* r11 = (u16 *)paca + offset */ + sth r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ + addi r3,r3,1 /* offset++ */ + b 2f +1: /* offset >= SLB_CACHE_ENTRIES */ + li r3,SLB_CACHE_ENTRIES+1 +2: + sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + crclr 4*cr0+eq /* set result to "success" */ + blr + +/* + * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. + * We assume legacy iSeries will never have 1T segments. + * + * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9 + */ +slb_finish_load_1T: + srdi r10,r10,40-28 /* get 1T ESID */ + ASM_VSID_SCRAMBLE(r10,r9,1T) + rldimi r11,r10,SLB_VSID_SHIFT_1T,16 /* combine VSID and flags */ + li r10,MMU_SEGSIZE_1T + rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ + + /* r3 = EA, r11 = VSID data */ + clrrdi r3,r3,SID_SHIFT_1T /* clear out non-ESID bits */ + b 7b + diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c new file mode 100644 index 0000000..ba51948 --- /dev/null +++ b/arch/powerpc/mm/slice.c @@ -0,0 +1,734 @@ +/* + * address space "slices" (meta-segments) support + * + * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. + * + * Based on hugetlb implementation + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <asm/mman.h> +#include <asm/mmu.h> +#include <asm/spu.h> + +static DEFINE_SPINLOCK(slice_convert_lock); + + +#ifdef DEBUG +int _slice_debug = 1; + +static void slice_print_mask(const char *label, struct slice_mask mask) +{ + char *p, buf[16 + 3 + 16 + 1]; + int i; + + if (!_slice_debug) + return; + p = buf; + for (i = 0; i < SLICE_NUM_LOW; i++) + *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0'; + *(p++) = ' '; + *(p++) = '-'; + *(p++) = ' '; + for (i = 0; i < SLICE_NUM_HIGH; i++) + *(p++) = (mask.high_slices & (1 << i)) ? '1' : '0'; + *(p++) = 0; + + printk(KERN_DEBUG "%s:%s\n", label, buf); +} + +#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0) + +#else + +static void slice_print_mask(const char *label, struct slice_mask mask) {} +#define slice_dbg(fmt...) + +#endif + +static struct slice_mask slice_range_to_mask(unsigned long start, + unsigned long len) +{ + unsigned long end = start + len - 1; + struct slice_mask ret = { 0, 0 }; + + if (start < SLICE_LOW_TOP) { + unsigned long mend = min(end, SLICE_LOW_TOP); + unsigned long mstart = min(start, SLICE_LOW_TOP); + + ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(mstart)); + } + + if ((start + len) > SLICE_LOW_TOP) + ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1)) + - (1u << GET_HIGH_SLICE_INDEX(start)); + + return ret; +} + +static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + + if ((mm->task_size - len) < addr) + return 0; + vma = find_vma(mm, addr); + return (!vma || (addr + len) <= vma->vm_start); +} + +static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) +{ + return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, + 1ul << SLICE_LOW_SHIFT); +} + +static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) +{ + unsigned long start = slice << SLICE_HIGH_SHIFT; + unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); + + /* Hack, so that each addresses is controlled by exactly one + * of the high or low area bitmaps, the first high area starts + * at 4GB, not 0 */ + if (start == 0) + start = SLICE_LOW_TOP; + + return !slice_area_is_free(mm, start, end - start); +} + +static struct slice_mask slice_mask_for_free(struct mm_struct *mm) +{ + struct slice_mask ret = { 0, 0 }; + unsigned long i; + + for (i = 0; i < SLICE_NUM_LOW; i++) + if (!slice_low_has_vma(mm, i)) + ret.low_slices |= 1u << i; + + if (mm->task_size <= SLICE_LOW_TOP) + return ret; + + for (i = 0; i < SLICE_NUM_HIGH; i++) + if (!slice_high_has_vma(mm, i)) + ret.high_slices |= 1u << i; + + return ret; +} + +static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) +{ + struct slice_mask ret = { 0, 0 }; + unsigned long i; + u64 psizes; + + psizes = mm->context.low_slices_psize; + for (i = 0; i < SLICE_NUM_LOW; i++) + if (((psizes >> (i * 4)) & 0xf) == psize) + ret.low_slices |= 1u << i; + + psizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) + if (((psizes >> (i * 4)) & 0xf) == psize) + ret.high_slices |= 1u << i; + + return ret; +} + +static int slice_check_fit(struct slice_mask mask, struct slice_mask available) +{ + return (mask.low_slices & available.low_slices) == mask.low_slices && + (mask.high_slices & available.high_slices) == mask.high_slices; +} + +static void slice_flush_segments(void *parm) +{ + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + /* update the paca copy of the context struct */ + get_paca()->context = current->active_mm->context; + + local_irq_save(flags); + slb_flush_and_rebolt(); + local_irq_restore(flags); +} + +static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize) +{ + /* Write the new slice psize bits */ + u64 lpsizes, hpsizes; + unsigned long i, flags; + + slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); + slice_print_mask(" mask", mask); + + /* We need to use a spinlock here to protect against + * concurrent 64k -> 4k demotion ... + */ + spin_lock_irqsave(&slice_convert_lock, flags); + + lpsizes = mm->context.low_slices_psize; + for (i = 0; i < SLICE_NUM_LOW; i++) + if (mask.low_slices & (1u << i)) + lpsizes = (lpsizes & ~(0xful << (i * 4))) | + (((unsigned long)psize) << (i * 4)); + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) + if (mask.high_slices & (1u << i)) + hpsizes = (hpsizes & ~(0xful << (i * 4))) | + (((unsigned long)psize) << (i * 4)); + + mm->context.low_slices_psize = lpsizes; + mm->context.high_slices_psize = hpsizes; + + slice_dbg(" lsps=%lx, hsps=%lx\n", + mm->context.low_slices_psize, + mm->context.high_slices_psize); + + spin_unlock_irqrestore(&slice_convert_lock, flags); + +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif +} + +static unsigned long slice_find_area_bottomup(struct mm_struct *mm, + unsigned long len, + struct slice_mask available, + int psize, int use_cache) +{ + struct vm_area_struct *vma; + unsigned long start_addr, addr; + struct slice_mask mask; + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + + if (use_cache) { + if (len <= mm->cached_hole_size) { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + } else + start_addr = addr = mm->free_area_cache; + } else + start_addr = addr = TASK_UNMAPPED_BASE; + +full_search: + for (;;) { + addr = _ALIGN_UP(addr, 1ul << pshift); + if ((TASK_SIZE - len) < addr) + break; + vma = find_vma(mm, addr); + BUG_ON(vma && (addr >= vma->vm_end)); + + mask = slice_range_to_mask(addr, len); + if (!slice_check_fit(mask, available)) { + if (addr < SLICE_LOW_TOP) + addr = _ALIGN_UP(addr + 1, 1ul << SLICE_LOW_SHIFT); + else + addr = _ALIGN_UP(addr + 1, 1ul << SLICE_HIGH_SHIFT); + continue; + } + if (!vma || addr + len <= vma->vm_start) { + /* + * Remember the place where we stopped the search: + */ + if (use_cache) + mm->free_area_cache = addr + len; + return addr; + } + if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = vma->vm_end; + } + + /* Make sure we didn't miss any holes */ + if (use_cache && start_addr != TASK_UNMAPPED_BASE) { + start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + goto full_search; + } + return -ENOMEM; +} + +static unsigned long slice_find_area_topdown(struct mm_struct *mm, + unsigned long len, + struct slice_mask available, + int psize, int use_cache) +{ + struct vm_area_struct *vma; + unsigned long addr; + struct slice_mask mask; + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + + /* check if free_area_cache is useful for us */ + if (use_cache) { + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested + * address hole + */ + addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (addr > len) { + addr = _ALIGN_DOWN(addr - len, 1ul << pshift); + mask = slice_range_to_mask(addr, len); + if (slice_check_fit(mask, available) && + slice_area_is_free(mm, addr, len)) + /* remember the address as a hint for + * next time + */ + return (mm->free_area_cache = addr); + } + } + + addr = mm->mmap_base; + while (addr > len) { + /* Go down by chunk size */ + addr = _ALIGN_DOWN(addr - len, 1ul << pshift); + + /* Check for hit with different page size */ + mask = slice_range_to_mask(addr, len); + if (!slice_check_fit(mask, available)) { + if (addr < SLICE_LOW_TOP) + addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT); + else if (addr < (1ul << SLICE_HIGH_SHIFT)) + addr = SLICE_LOW_TOP; + else + addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT); + continue; + } + + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (!vma || (addr + len) <= vma->vm_start) { + /* remember the address as a hint for next time */ + if (use_cache) + mm->free_area_cache = addr; + return addr; + } + + /* remember the largest hole we saw so far */ + if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start; + } + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + addr = slice_find_area_bottomup(mm, len, available, psize, 0); + + /* + * Restore the topdown base: + */ + if (use_cache) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + } + + return addr; +} + + +static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, + struct slice_mask mask, int psize, + int topdown, int use_cache) +{ + if (topdown) + return slice_find_area_topdown(mm, len, mask, psize, use_cache); + else + return slice_find_area_bottomup(mm, len, mask, psize, use_cache); +} + +#define or_mask(dst, src) do { \ + (dst).low_slices |= (src).low_slices; \ + (dst).high_slices |= (src).high_slices; \ +} while (0) + +#define andnot_mask(dst, src) do { \ + (dst).low_slices &= ~(src).low_slices; \ + (dst).high_slices &= ~(src).high_slices; \ +} while (0) + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_PAGE_BASE MMU_PAGE_64K +#else +#define MMU_PAGE_BASE MMU_PAGE_4K +#endif + +unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, + unsigned long flags, unsigned int psize, + int topdown, int use_cache) +{ + struct slice_mask mask = {0, 0}; + struct slice_mask good_mask; + struct slice_mask potential_mask = {0,0} /* silence stupid warning */; + struct slice_mask compat_mask = {0, 0}; + int fixed = (flags & MAP_FIXED); + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + struct mm_struct *mm = current->mm; + unsigned long newaddr; + + /* Sanity checks */ + BUG_ON(mm->task_size == 0); + + slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n", + addr, len, flags, topdown, use_cache); + + if (len > mm->task_size) + return -ENOMEM; + if (len & ((1ul << pshift) - 1)) + return -EINVAL; + if (fixed && (addr & ((1ul << pshift) - 1))) + return -EINVAL; + if (fixed && addr > (mm->task_size - len)) + return -EINVAL; + + /* If hint, make sure it matches our alignment restrictions */ + if (!fixed && addr) { + addr = _ALIGN_UP(addr, 1ul << pshift); + slice_dbg(" aligned addr=%lx\n", addr); + /* Ignore hint if it's too large or overlaps a VMA */ + if (addr > mm->task_size - len || + !slice_area_is_free(mm, addr, len)) + addr = 0; + } + + /* First make up a "good" mask of slices that have the right size + * already + */ + good_mask = slice_mask_for_size(mm, psize); + slice_print_mask(" good_mask", good_mask); + + /* + * Here "good" means slices that are already the right page size, + * "compat" means slices that have a compatible page size (i.e. + * 4k in a 64k pagesize kernel), and "free" means slices without + * any VMAs. + * + * If MAP_FIXED: + * check if fits in good | compat => OK + * check if fits in good | compat | free => convert free + * else bad + * If have hint: + * check if hint fits in good => OK + * check if hint fits in good | free => convert free + * Otherwise: + * search in good, found => OK + * search in good | free, found => convert free + * search in good | compat | free, found => convert free. + */ + +#ifdef CONFIG_PPC_64K_PAGES + /* If we support combo pages, we can allow 64k pages in 4k slices */ + if (psize == MMU_PAGE_64K) { + compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + if (fixed) + or_mask(good_mask, compat_mask); + } +#endif + + /* First check hint if it's valid or if we have MAP_FIXED */ + if (addr != 0 || fixed) { + /* Build a mask for the requested range */ + mask = slice_range_to_mask(addr, len); + slice_print_mask(" mask", mask); + + /* Check if we fit in the good mask. If we do, we just return, + * nothing else to do + */ + if (slice_check_fit(mask, good_mask)) { + slice_dbg(" fits good !\n"); + return addr; + } + } else { + /* Now let's see if we can find something in the existing + * slices for that size + */ + newaddr = slice_find_area(mm, len, good_mask, psize, topdown, + use_cache); + if (newaddr != -ENOMEM) { + /* Found within the good mask, we don't have to setup, + * we thus return directly + */ + slice_dbg(" found area at 0x%lx\n", newaddr); + return newaddr; + } + } + + /* We don't fit in the good mask, check what other slices are + * empty and thus can be converted + */ + potential_mask = slice_mask_for_free(mm); + or_mask(potential_mask, good_mask); + slice_print_mask(" potential", potential_mask); + + if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { + slice_dbg(" fits potential !\n"); + goto convert; + } + + /* If we have MAP_FIXED and failed the above steps, then error out */ + if (fixed) + return -EBUSY; + + slice_dbg(" search...\n"); + + /* If we had a hint that didn't work out, see if we can fit + * anywhere in the good area. + */ + if (addr) { + addr = slice_find_area(mm, len, good_mask, psize, topdown, + use_cache); + if (addr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", addr); + return addr; + } + } + + /* Now let's see if we can find something in the existing slices + * for that size plus free slices + */ + addr = slice_find_area(mm, len, potential_mask, psize, topdown, + use_cache); + +#ifdef CONFIG_PPC_64K_PAGES + if (addr == -ENOMEM && psize == MMU_PAGE_64K) { + /* retry the search with 4k-page slices included */ + or_mask(potential_mask, compat_mask); + addr = slice_find_area(mm, len, potential_mask, psize, + topdown, use_cache); + } +#endif + + if (addr == -ENOMEM) + return -ENOMEM; + + mask = slice_range_to_mask(addr, len); + slice_dbg(" found potential area at 0x%lx\n", addr); + slice_print_mask(" mask", mask); + + convert: + andnot_mask(mask, good_mask); + andnot_mask(mask, compat_mask); + if (mask.low_slices || mask.high_slices) { + slice_convert(mm, mask, psize); + if (psize > MMU_PAGE_BASE) + on_each_cpu(slice_flush_segments, mm, 1); + } + return addr; + +} +EXPORT_SYMBOL_GPL(slice_get_unmapped_area); + +unsigned long arch_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + return slice_get_unmapped_area(addr, len, flags, + current->mm->context.user_psize, + 0, 1); +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + return slice_get_unmapped_area(addr0, len, flags, + current->mm->context.user_psize, + 1, 1); +} + +unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + u64 psizes; + int index; + + if (addr < SLICE_LOW_TOP) { + psizes = mm->context.low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + } else { + psizes = mm->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + } + + return (psizes >> (index * 4)) & 0xf; +} +EXPORT_SYMBOL_GPL(get_slice_psize); + +/* + * This is called by hash_page when it needs to do a lazy conversion of + * an address space from real 64K pages to combo 4K pages (typically + * when hitting a non cacheable mapping on a processor or hypervisor + * that won't allow them for 64K pages). + * + * This is also called in init_new_context() to change back the user + * psize from whatever the parent context had it set to + * N.B. This may be called before mm->context.id has been set. + * + * This function will only change the content of the {low,high)_slice_psize + * masks, it will not flush SLBs as this shall be handled lazily by the + * caller. + */ +void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) +{ + unsigned long flags, lpsizes, hpsizes; + unsigned int old_psize; + int i; + + slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + + spin_lock_irqsave(&slice_convert_lock, flags); + + old_psize = mm->context.user_psize; + slice_dbg(" old_psize=%d\n", old_psize); + if (old_psize == psize) + goto bail; + + mm->context.user_psize = psize; + wmb(); + + lpsizes = mm->context.low_slices_psize; + for (i = 0; i < SLICE_NUM_LOW; i++) + if (((lpsizes >> (i * 4)) & 0xf) == old_psize) + lpsizes = (lpsizes & ~(0xful << (i * 4))) | + (((unsigned long)psize) << (i * 4)); + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) + if (((hpsizes >> (i * 4)) & 0xf) == old_psize) + hpsizes = (hpsizes & ~(0xful << (i * 4))) | + (((unsigned long)psize) << (i * 4)); + + mm->context.low_slices_psize = lpsizes; + mm->context.high_slices_psize = hpsizes; + + slice_dbg(" lsps=%lx, hsps=%lx\n", + mm->context.low_slices_psize, + mm->context.high_slices_psize); + + bail: + spin_unlock_irqrestore(&slice_convert_lock, flags); +} + +void slice_set_psize(struct mm_struct *mm, unsigned long address, + unsigned int psize) +{ + unsigned long i, flags; + u64 *p; + + spin_lock_irqsave(&slice_convert_lock, flags); + if (address < SLICE_LOW_TOP) { + i = GET_LOW_SLICE_INDEX(address); + p = &mm->context.low_slices_psize; + } else { + i = GET_HIGH_SLICE_INDEX(address); + p = &mm->context.high_slices_psize; + } + *p = (*p & ~(0xful << (i * 4))) | ((unsigned long) psize << (i * 4)); + spin_unlock_irqrestore(&slice_convert_lock, flags); + +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif +} + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize) +{ + struct slice_mask mask = slice_range_to_mask(start, len); + + slice_convert(mm, mask, psize); +} + +/* + * is_hugepage_only_range() is used by generic code to verify wether + * a normal mmap mapping (non hugetlbfs) is valid on a given area. + * + * until the generic code provides a more generic hook and/or starts + * calling arch get_unmapped_area for MAP_FIXED (which our implementation + * here knows how to deal with), we hijack it to keep standard mappings + * away from us. + * + * because of that generic code limitation, MAP_FIXED mapping cannot + * "convert" back a slice with no VMAs to the standard page size, only + * get_unmapped_area() can. It would be possible to fix it here but I + * prefer working on fixing the generic code instead. + * + * WARNING: This will not work if hugetlbfs isn't enabled since the + * generic code will redefine that function as 0 in that. This is ok + * for now as we only use slices with hugetlbfs enabled. This should + * be fixed as the generic code gets fixed. + */ +int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct slice_mask mask, available; + unsigned int psize = mm->context.user_psize; + + mask = slice_range_to_mask(addr, len); + available = slice_mask_for_size(mm, psize); +#ifdef CONFIG_PPC_64K_PAGES + /* We need to account for 4k slices too */ + if (psize == MMU_PAGE_64K) { + struct slice_mask compat_mask; + compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + or_mask(available, compat_mask); + } +#endif + +#if 0 /* too verbose */ + slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n", + mm, addr, len); + slice_print_mask(" mask", mask); + slice_print_mask(" available", available); +#endif + return !slice_check_fit(mask, available); +} + diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c new file mode 100644 index 0000000..60e6032 --- /dev/null +++ b/arch/powerpc/mm/stab.c @@ -0,0 +1,287 @@ +/* + * PowerPC64 Segment Translation Support. + * + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/lmb.h> + +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/paca.h> +#include <asm/cputable.h> +#include <asm/prom.h> +#include <asm/abs_addr.h> +#include <asm/firmware.h> +#include <asm/iseries/hv_call.h> + +struct stab_entry { + unsigned long esid_data; + unsigned long vsid_data; +}; + +#define NR_STAB_CACHE_ENTRIES 8 +static DEFINE_PER_CPU(long, stab_cache_ptr); +static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); + +/* + * Create a segment table entry for the given esid/vsid pair. + */ +static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid) +{ + unsigned long esid_data, vsid_data; + unsigned long entry, group, old_esid, castout_entry, i; + unsigned int global_entry; + struct stab_entry *ste, *castout_ste; + unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET; + + vsid_data = vsid << STE_VSID_SHIFT; + esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; + if (! kernel_segment) + esid_data |= STE_ESID_KS; + + /* Search the primary group first. */ + global_entry = (esid & 0x1f) << 3; + ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); + + /* Find an empty entry, if one exists. */ + for (group = 0; group < 2; group++) { + for (entry = 0; entry < 8; entry++, ste++) { + if (!(ste->esid_data & STE_ESID_V)) { + ste->vsid_data = vsid_data; + eieio(); + ste->esid_data = esid_data; + return (global_entry | entry); + } + } + /* Now search the secondary group. */ + global_entry = ((~esid) & 0x1f) << 3; + ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); + } + + /* + * Could not find empty entry, pick one with a round robin selection. + * Search all entries in the two groups. + */ + castout_entry = get_paca()->stab_rr; + for (i = 0; i < 16; i++) { + if (castout_entry < 8) { + global_entry = (esid & 0x1f) << 3; + ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); + castout_ste = ste + castout_entry; + } else { + global_entry = ((~esid) & 0x1f) << 3; + ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); + castout_ste = ste + (castout_entry - 8); + } + + /* Dont cast out the first kernel segment */ + if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET) + break; + + castout_entry = (castout_entry + 1) & 0xf; + } + + get_paca()->stab_rr = (castout_entry + 1) & 0xf; + + /* Modify the old entry to the new value. */ + + /* Force previous translations to complete. DRENG */ + asm volatile("isync" : : : "memory"); + + old_esid = castout_ste->esid_data >> SID_SHIFT; + castout_ste->esid_data = 0; /* Invalidate old entry */ + + asm volatile("sync" : : : "memory"); /* Order update */ + + castout_ste->vsid_data = vsid_data; + eieio(); /* Order update */ + castout_ste->esid_data = esid_data; + + asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT)); + /* Ensure completion of slbie */ + asm volatile("sync" : : : "memory"); + + return (global_entry | (castout_entry & 0x7)); +} + +/* + * Allocate a segment table entry for the given ea and mm + */ +static int __ste_allocate(unsigned long ea, struct mm_struct *mm) +{ + unsigned long vsid; + unsigned char stab_entry; + unsigned long offset; + + /* Kernel or user address? */ + if (is_kernel_addr(ea)) { + vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M); + } else { + if ((ea >= TASK_SIZE_USER64) || (! mm)) + return 1; + + vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M); + } + + stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); + + if (!is_kernel_addr(ea)) { + offset = __get_cpu_var(stab_cache_ptr); + if (offset < NR_STAB_CACHE_ENTRIES) + __get_cpu_var(stab_cache[offset++]) = stab_entry; + else + offset = NR_STAB_CACHE_ENTRIES+1; + __get_cpu_var(stab_cache_ptr) = offset; + + /* Order update */ + asm volatile("sync":::"memory"); + } + + return 0; +} + +int ste_allocate(unsigned long ea) +{ + return __ste_allocate(ea, current->mm); +} + +/* + * Do the segment table work for a context switch: flush all user + * entries from the table, then preload some probably useful entries + * for the new task + */ +void switch_stab(struct task_struct *tsk, struct mm_struct *mm) +{ + struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; + struct stab_entry *ste; + unsigned long offset = __get_cpu_var(stab_cache_ptr); + unsigned long pc = KSTK_EIP(tsk); + unsigned long stack = KSTK_ESP(tsk); + unsigned long unmapped_base; + + /* Force previous translations to complete. DRENG */ + asm volatile("isync" : : : "memory"); + + if (offset <= NR_STAB_CACHE_ENTRIES) { + int i; + + for (i = 0; i < offset; i++) { + ste = stab + __get_cpu_var(stab_cache[i]); + ste->esid_data = 0; /* invalidate entry */ + } + } else { + unsigned long entry; + + /* Invalidate all entries. */ + ste = stab; + + /* Never flush the first entry. */ + ste += 1; + for (entry = 1; + entry < (HW_PAGE_SIZE / sizeof(struct stab_entry)); + entry++, ste++) { + unsigned long ea; + ea = ste->esid_data & ESID_MASK; + if (!is_kernel_addr(ea)) { + ste->esid_data = 0; + } + } + } + + asm volatile("sync; slbia; sync":::"memory"); + + __get_cpu_var(stab_cache_ptr) = 0; + + /* Now preload some entries for the new task */ + if (test_tsk_thread_flag(tsk, TIF_32BIT)) + unmapped_base = TASK_UNMAPPED_BASE_USER32; + else + unmapped_base = TASK_UNMAPPED_BASE_USER64; + + __ste_allocate(pc, mm); + + if (GET_ESID(pc) == GET_ESID(stack)) + return; + + __ste_allocate(stack, mm); + + if ((GET_ESID(pc) == GET_ESID(unmapped_base)) + || (GET_ESID(stack) == GET_ESID(unmapped_base))) + return; + + __ste_allocate(unmapped_base, mm); + + /* Order update */ + asm volatile("sync" : : : "memory"); +} + +/* + * Allocate segment tables for secondary CPUs. These must all go in + * the first (bolted) segment, so that do_stab_bolted won't get a + * recursive segment miss on the segment table itself. + */ +void __init stabs_alloc(void) +{ + int cpu; + + if (cpu_has_feature(CPU_FTR_SLB)) + return; + + for_each_possible_cpu(cpu) { + unsigned long newstab; + + if (cpu == 0) + continue; /* stab for CPU 0 is statically allocated */ + + newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE, + 1<<SID_SHIFT); + newstab = (unsigned long)__va(newstab); + + memset((void *)newstab, 0, HW_PAGE_SIZE); + + paca[cpu].stab_addr = newstab; + paca[cpu].stab_real = virt_to_abs(newstab); + printk(KERN_INFO "Segment table for CPU %d at 0x%lx " + "virtual, 0x%lx absolute\n", + cpu, paca[cpu].stab_addr, paca[cpu].stab_real); + } +} + +/* + * Build an entry for the base kernel segment and put it into + * the segment table or SLB. All other segment table or SLB + * entries are faulted in. + */ +void stab_initialize(unsigned long stab) +{ + unsigned long vsid = get_kernel_vsid(PAGE_OFFSET, MMU_SEGSIZE_256M); + unsigned long stabreal; + + asm volatile("isync; slbia; isync":::"memory"); + make_ste(stab, GET_ESID(PAGE_OFFSET), vsid); + + /* Order update */ + asm volatile("sync":::"memory"); + + /* Set ASR */ + stabreal = get_paca()->stab_real | 0x1ul; + +#ifdef CONFIG_PPC_ISERIES + if (firmware_has_feature(FW_FEATURE_ISERIES)) { + HvCall1(HvCallBaseSetASR, stabreal); + return; + } +#endif /* CONFIG_PPC_ISERIES */ + + mtspr(SPRN_ASR, stabreal); +} diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c new file mode 100644 index 0000000..4cafc0c --- /dev/null +++ b/arch/powerpc/mm/subpage-prot.c @@ -0,0 +1,213 @@ +/* + * Copyright 2007-2008 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> + +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/tlbflush.h> + +/* + * Free all pages allocated for subpage protection maps and pointers. + * Also makes sure that the subpage_prot_table structure is + * reinitialized for the next user. + */ +void subpage_prot_free(pgd_t *pgd) +{ + struct subpage_prot_table *spt = pgd_subpage_prot(pgd); + unsigned long i, j, addr; + u32 **p; + + for (i = 0; i < 4; ++i) { + if (spt->low_prot[i]) { + free_page((unsigned long)spt->low_prot[i]); + spt->low_prot[i] = NULL; + } + } + addr = 0; + for (i = 0; i < 2; ++i) { + p = spt->protptrs[i]; + if (!p) + continue; + spt->protptrs[i] = NULL; + for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; + ++j, addr += PAGE_SIZE) + if (p[j]) + free_page((unsigned long)p[j]); + free_page((unsigned long)p); + } + spt->maxaddr = 0; +} + +static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, + int npages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return; + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return; + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; npages > 0; --npages) { + pte_update(mm, addr, pte, 0, 0); + addr += PAGE_SIZE; + ++pte; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +} + +/* + * Clear the subpage protection map for an address range, allowing + * all accesses that are allowed by the pte permissions. + */ +static void subpage_prot_clear(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); + u32 **spm, *spp; + int i, nw; + unsigned long next, limit; + + down_write(&mm->mmap_sem); + limit = addr + len; + if (limit > spt->maxaddr) + limit = spt->maxaddr; + for (; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + if (addr < 0x100000000) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) + continue; + } + spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!spp) + continue; + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + memset(spp, 0, nw * sizeof(u32)); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + up_write(&mm->mmap_sem); +} + +/* + * Copy in a subpage protection map for an address range. + * The map has 2 bits per 4k subpage, so 32 bits per 64k page. + * Each 2-bit field is 0 to allow any access, 1 to prevent writes, + * 2 or 3 to prevent all accesses. + * Note that the normal page protections also apply; the subpage + * protection mechanism is an additional constraint, so putting 0 + * in a 2-bit field won't allow writes to a page that is otherwise + * write-protected. + */ +long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd); + u32 **spm, *spp; + int i, nw; + unsigned long next, limit; + int err; + + /* Check parameters */ + if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || + addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE) + return -EINVAL; + + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; + + if (!map) { + /* Clear out the protection map for the address range */ + subpage_prot_clear(addr, len); + return 0; + } + + if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32))) + return -EFAULT; + + down_write(&mm->mmap_sem); + for (limit = addr + len; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + err = -ENOMEM; + if (addr < 0x100000000) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) { + spm = (u32 **)get_zeroed_page(GFP_KERNEL); + if (!spm) + goto out; + spt->protptrs[addr >> SBP_L3_SHIFT] = spm; + } + } + spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); + spp = *spm; + if (!spp) { + spp = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!spp) + goto out; + *spm = spp; + } + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + local_irq_disable(); + demote_segment_4k(mm, addr); + local_irq_enable(); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + err = -EFAULT; + if (__copy_from_user(spp, map, nw * sizeof(u32))) + goto out2; + map += nw; + down_write(&mm->mmap_sem); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + if (limit > spt->maxaddr) + spt->maxaddr = limit; + err = 0; + out: + up_write(&mm->mmap_sem); + out2: + return err; +} diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c new file mode 100644 index 0000000..f9a47fe --- /dev/null +++ b/arch/powerpc/mm/tlb_32.c @@ -0,0 +1,190 @@ +/* + * This file contains the routines for TLB flushing. + * On machines where the MMU uses a hash table to store virtual to + * physical translations, these routines flush entries from the + * hash table also. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> + +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +#include "mmu_decl.h" + +/* + * Called when unmapping pages to flush entries from the TLB/hash table. + */ +void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) +{ + unsigned long ptephys; + + if (Hash != 0) { + ptephys = __pa(ptep) & PAGE_MASK; + flush_hash_pages(mm->context.id, addr, ptephys, 1); + } +} +EXPORT_SYMBOL(flush_hash_entry); + +/* + * Called by ptep_set_access_flags, must flush on CPUs for which the + * DSI handler can't just "fixup" the TLB on a write fault + */ +void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr) +{ + if (Hash != 0) + return; + _tlbie(addr); +} + +/* + * Called at the end of a mmu_gather operation to make sure the + * TLB flush is completely done. + */ +void tlb_flush(struct mmu_gather *tlb) +{ + if (Hash == 0) { + /* + * 603 needs to flush the whole TLB here since + * it doesn't use a hash table. + */ + _tlbia(); + } +} + +/* + * TLB flushing: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * since the hardware hash table functions as an extension of the + * tlb as far as the linux tables are concerned, flush it too. + * -- Cort + */ + +/* + * 750 SMP is a Bad Idea because the 750 doesn't broadcast all + * the cache operations on the bus. Hence we need to use an IPI + * to get the other CPU(s) to invalidate their TLBs. + */ +#ifdef CONFIG_SMP_750 +#define FINISH_FLUSH smp_send_tlb_invalidate(0) +#else +#define FINISH_FLUSH do { } while (0) +#endif + +static void flush_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + pmd_t *pmd; + unsigned long pmd_end; + int count; + unsigned int ctx = mm->context.id; + + if (Hash == 0) { + _tlbia(); + return; + } + start &= PAGE_MASK; + if (start >= end) + return; + end = (end - 1) | ~PAGE_MASK; + pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); + for (;;) { + pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; + if (pmd_end > end) + pmd_end = end; + if (!pmd_none(*pmd)) { + count = ((pmd_end - start) >> PAGE_SHIFT) + 1; + flush_hash_pages(ctx, start, pmd_val(*pmd), count); + } + if (pmd_end == end) + break; + start = pmd_end + 1; + ++pmd; + } +} + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_range(&init_mm, start, end); + FINISH_FLUSH; +} + +/* + * Flush all the (user) entries for the address space described by mm. + */ +void flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *mp; + + if (Hash == 0) { + _tlbia(); + return; + } + + /* + * It is safe to go down the mm's list of vmas when called + * from dup_mmap, holding mmap_sem. It would also be safe from + * unmap_region or exit_mmap, but not from vmtruncate on SMP - + * but it seems dup_mmap is the only SMP case which gets here. + */ + for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); + FINISH_FLUSH; +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct mm_struct *mm; + pmd_t *pmd; + + if (Hash == 0) { + _tlbie(vmaddr); + return; + } + mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm; + pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); + if (!pmd_none(*pmd)) + flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); + FINISH_FLUSH; +} + +/* + * For each address in the range, find the pte for the address + * and check _PAGE_HASHPTE bit; if it is set, find and destroy + * the corresponding HPTE. + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + flush_range(vma->vm_mm, start, end); + FINISH_FLUSH; +} diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c new file mode 100644 index 0000000..be7dd42 --- /dev/null +++ b/arch/powerpc/mm/tlb_64.c @@ -0,0 +1,297 @@ +/* + * This file contains the routines for flushing entries from the + * TLB and MMU hash table. + * + * Derived from arch/ppc64/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/bug.h> + +DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); + +/* This is declared as we are using the more or less generic + * arch/powerpc/include/asm/tlb.h file -- tgall + */ +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +static unsigned long pte_freelist_forced_free; + +struct pte_freelist_batch +{ + struct rcu_head rcu; + unsigned int index; + pgtable_free_t tables[0]; +}; + +#define PTE_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ + / sizeof(pgtable_free_t)) + +static void pte_free_smp_sync(void *arg) +{ + /* Do nothing, just ensure we sync with all CPUs */ +} + +/* This is only called when we are critically out of memory + * (and fail to get a page in pte_free_tlb). + */ +static void pgtable_free_now(pgtable_free_t pgf) +{ + pte_freelist_forced_free++; + + smp_call_function(pte_free_smp_sync, NULL, 1); + + pgtable_free(pgf); +} + +static void pte_free_rcu_callback(struct rcu_head *head) +{ + struct pte_freelist_batch *batch = + container_of(head, struct pte_freelist_batch, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + pgtable_free(batch->tables[i]); + + free_page((unsigned long)batch); +} + +static void pte_free_submit(struct pte_freelist_batch *batch) +{ + INIT_RCU_HEAD(&batch->rcu); + call_rcu(&batch->rcu, pte_free_rcu_callback); +} + +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +{ + /* This is safe since tlb_gather_mmu has disabled preemption */ + cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); + struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { + pgtable_free(pgf); + return; + } + + if (*batchp == NULL) { + *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); + if (*batchp == NULL) { + pgtable_free_now(pgf); + return; + } + (*batchp)->index = 0; + } + (*batchp)->tables[(*batchp)->index++] = pgf; + if ((*batchp)->index == PTE_FREELIST_SIZE) { + pte_free_submit(*batchp); + *batchp = NULL; + } +} + +/* + * A linux PTE was changed and the corresponding hash table entry + * neesd to be flushed. This function will either perform the flush + * immediately or will batch it up if the current CPU has an active + * batch on it. + * + * Must be called from within some kind of spinlock/non-preempt region... + */ +void hpte_need_flush(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long pte, int huge) +{ + struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + unsigned long vsid, vaddr; + unsigned int psize; + int ssize; + real_pte_t rpte; + int i; + + i = batch->index; + + /* We mask the address for the base page size. Huge pages will + * have applied their own masking already + */ + addr &= PAGE_MASK; + + /* Get page size (maybe move back to caller). + * + * NOTE: when using special 64K mappings in 4K environment like + * for SPEs, we obtain the page size from the slice, which thus + * must still exist (and thus the VMA not reused) at the time + * of this call + */ + if (huge) { +#ifdef CONFIG_HUGETLB_PAGE + psize = get_slice_psize(mm, addr);; +#else + BUG(); + psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ +#endif + } else + psize = pte_pagesize_index(mm, addr, pte); + + /* Build full vaddr */ + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + vaddr = hpt_va(addr, vsid, ssize); + rpte = __real_pte(__pte(pte), ptep); + + /* + * Check if we have an active batch on this CPU. If not, just + * flush now and return. For now, we don global invalidates + * in that case, might be worth testing the mm cpu mask though + * and decide to use local invalidates instead... + */ + if (!batch->active) { + flush_hash_page(vaddr, rpte, psize, ssize, 0); + return; + } + + /* + * This can happen when we are in the middle of a TLB batch and + * we encounter memory pressure (eg copy_page_range when it tries + * to allocate a new pte). If we have to reclaim memory and end + * up scanning and resetting referenced bits then our batch context + * will change mid stream. + * + * We also need to ensure only one page size is present in a given + * batch + */ + if (i != 0 && (mm != batch->mm || batch->psize != psize || + batch->ssize != ssize)) { + __flush_tlb_pending(batch); + i = 0; + } + if (i == 0) { + batch->mm = mm; + batch->psize = psize; + batch->ssize = ssize; + } + batch->pte[i] = rpte; + batch->vaddr[i] = vaddr; + batch->index = ++i; + if (i >= PPC64_TLB_BATCH_NR) + __flush_tlb_pending(batch); +} + +/* + * This function is called when terminating an mmu batch or when a batch + * is full. It will perform the flush of all the entries currently stored + * in a batch. + * + * Must be called from within some kind of spinlock/non-preempt region... + */ +void __flush_tlb_pending(struct ppc64_tlb_batch *batch) +{ + cpumask_t tmp; + int i, local = 0; + + i = batch->index; + tmp = cpumask_of_cpu(smp_processor_id()); + if (cpus_equal(batch->mm->cpu_vm_mask, tmp)) + local = 1; + if (i == 1) + flush_hash_page(batch->vaddr[0], batch->pte[0], + batch->psize, batch->ssize, local); + else + flush_hash_range(i, local); + batch->index = 0; +} + +void pte_free_finish(void) +{ + /* This is safe since tlb_gather_mmu has disabled preemption */ + struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + + if (*batchp == NULL) + return; + pte_free_submit(*batchp); + *batchp = NULL; +} + +/** + * __flush_hash_table_range - Flush all HPTEs for a given address range + * from the hash table (and the TLB). But keeps + * the linux PTEs intact. + * + * @mm : mm_struct of the target address space (generally init_mm) + * @start : starting address + * @end : ending address (not included in the flush) + * + * This function is mostly to be used by some IO hotplug code in order + * to remove all hash entries from a given address range used to map IO + * space on a removed PCI-PCI bidge without tearing down the full mapping + * since 64K pages may overlap with other bridges when using 64K pages + * with 4K HW pages on IO space. + * + * Because of that usage pattern, it's only available with CONFIG_HOTPLUG + * and is implemented for small size rather than speed. + */ +#ifdef CONFIG_HOTPLUG + +void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + unsigned long flags; + + start = _ALIGN_DOWN(start, PAGE_SIZE); + end = _ALIGN_UP(end, PAGE_SIZE); + + BUG_ON(!mm->pgd); + + /* Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + for (; start < end; start += PAGE_SIZE) { + pte_t *ptep = find_linux_pte(mm->pgd, start); + unsigned long pte; + + if (ptep == NULL) + continue; + pte = pte_val(*ptep); + if (!(pte & _PAGE_HASHPTE)) + continue; + hpte_need_flush(mm, start, ptep, pte, 0); + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} + +#endif /* CONFIG_HOTPLUG */ |