Initial import of modified Linux 2.6.28 tree

Original upstream URL: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
author: Timothy Pearson <tpearson@raptorengineering.com> 2017-08-23 14:45:25 -0500
committer: Timothy Pearson <tpearson@raptorengineering.com> 2017-08-23 14:45:25 -0500
commit: fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch)
tree: 22962a4387943edc841c72a4e636a068c66d58fd /arch/powerpc/mm
download: ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip
ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz
29 files changed, 11218 insertions, 0 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c
new file mode 100644
index 0000000..29954dc
--- /dev/null
+++ b/arch/powerpc/mm/40x_mmu.c
@@ -0,0 +1,146 @@
+/*
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/bootx.h>
+#include <asm/machdep.h>
+#include <asm/setup.h>
+#include "mmu_decl.h"
+
+extern int __map_without_ltlbs;
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+	/*
+	 * The Zone Protection Register (ZPR) defines how protection will
+	 * be applied to every page which is a member of a given zone. At
+	 * present, we utilize only two of the 4xx's zones.
+	 * The zone index bits (of ZSEL) in the PTE are used for software
+	 * indicators, except the LSB.  For user access, zone 1 is used,
+	 * for kernel access, zone 0 is used.  We set all but zone 1
+	 * to zero, allowing only kernel access as indicated in the PTE.
+	 * For zone 1, we set a 01 binary (a value of 10 will not work)
+	 * to allow user access as indicated in the PTE.  This also allows
+	 * kernel access as indicated in the PTE.
+	 */
+
+        mtspr(SPRN_ZPR, 0x10000000);
+
+	flush_instruction_cache();
+
+	/*
+	 * Set up the real-mode cache parameters for the exception vector
+	 * handlers (which are run in real-mode).
+	 */
+
+        mtspr(SPRN_DCWR, 0x00000000);	/* All caching is write-back */
+
+        /*
+	 * Cache instruction and data space where the exception
+	 * vectors and the kernel live in real-mode.
+	 */
+
+        mtspr(SPRN_DCCR, 0xF0000000);	/* 512 MB of data space at 0x0. */
+        mtspr(SPRN_ICCR, 0xF0000000);	/* 512 MB of instr. space at 0x0. */
+}
+
+#define LARGE_PAGE_SIZE_16M	(1<<24)
+#define LARGE_PAGE_SIZE_4M	(1<<22)
+
+unsigned long __init mmu_mapin_ram(void)
+{
+	unsigned long v, s, mapped;
+	phys_addr_t p;
+
+	v = KERNELBASE;
+	p = 0;
+	s = total_lowmem;
+
+	if (__map_without_ltlbs)
+		return 0;
+
+	while (s >= LARGE_PAGE_SIZE_16M) {
+		pmd_t *pmdp;
+		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+
+		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+		pmd_val(*pmdp++) = val;
+		pmd_val(*pmdp++) = val;
+		pmd_val(*pmdp++) = val;
+		pmd_val(*pmdp++) = val;
+
+		v += LARGE_PAGE_SIZE_16M;
+		p += LARGE_PAGE_SIZE_16M;
+		s -= LARGE_PAGE_SIZE_16M;
+	}
+
+	while (s >= LARGE_PAGE_SIZE_4M) {
+		pmd_t *pmdp;
+		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
+
+		pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+		pmd_val(*pmdp) = val;
+
+		v += LARGE_PAGE_SIZE_4M;
+		p += LARGE_PAGE_SIZE_4M;
+		s -= LARGE_PAGE_SIZE_4M;
+	}
+
+	mapped = total_lowmem - s;
+
+	/* If the size of RAM is not an exact power of two, we may not
+	 * have covered RAM in its entirety with 16 and 4 MiB
+	 * pages. Consequently, restrict the top end of RAM currently
+	 * allocable so that calls to the LMB to allocate PTEs for "tail"
+	 * coverage with normal-sized pages (or other reasons) do not
+	 * attempt to allocate outside the allowed range.
+	 */
+
+	__initial_memory_limit_addr = memstart_addr + mapped;
+
+	return mapped;
+}
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
new file mode 100644
index 0000000..98052ac
--- /dev/null
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -0,0 +1,102 @@
+/*
+ * Modifications by Matt Porter (mporter@mvista.com) to support
+ * PPC44x Book E processors.
+ *
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/mmu.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/cacheflush.h>
+
+#include "mmu_decl.h"
+
+/* Used by the 44x TLB replacement exception handler.
+ * Just needed it declared someplace.
+ */
+unsigned int tlb_44x_index; /* = 0 */
+unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS;
+int icache_44x_need_flush;
+
+static void __init ppc44x_update_tlb_hwater(void)
+{
+	extern unsigned int tlb_44x_patch_hwater_D[];
+	extern unsigned int tlb_44x_patch_hwater_I[];
+
+	/* The TLB miss handlers hard codes the watermark in a cmpli
+	 * instruction to improve performances rather than loading it
+	 * from the global variable. Thus, we patch the instructions
+	 * in the 2 TLB miss handlers when updating the value
+	 */
+	tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) |
+		tlb_44x_hwater;
+	flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0],
+			   (unsigned long)&tlb_44x_patch_hwater_D[1]);
+	tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) |
+		tlb_44x_hwater;
+	flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0],
+			   (unsigned long)&tlb_44x_patch_hwater_I[1]);
+}
+
+/*
+ * "Pins" a 256MB TLB entry in AS0 for kernel lowmem
+ */
+static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys)
+{
+	unsigned int entry = tlb_44x_hwater--;
+
+	ppc44x_update_tlb_hwater();
+
+	__asm__ __volatile__(
+		"tlbwe	%2,%3,%4\n"
+		"tlbwe	%1,%3,%5\n"
+		"tlbwe	%0,%3,%6\n"
+	:
+	: "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G),
+	  "r" (phys),
+	  "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M),
+	  "r" (entry),
+	  "i" (PPC44x_TLB_PAGEID),
+	  "i" (PPC44x_TLB_XLAT),
+	  "i" (PPC44x_TLB_ATTRIB));
+}
+
+void __init MMU_init_hw(void)
+{
+	ppc44x_update_tlb_hwater();
+
+	flush_instruction_cache();
+}
+
+unsigned long __init mmu_mapin_ram(void)
+{
+	unsigned long addr;
+
+	/* Pin in enough TLBs to cover any lowmem not covered by the
+	 * initial 256M mapping established in head_44x.S */
+	for (addr = PPC_PIN_SIZE; addr < lowmem_end_addr;
+	     addr += PPC_PIN_SIZE)
+		ppc44x_pin_tlb(addr + PAGE_OFFSET, addr);
+
+	return total_lowmem;
+}
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
new file mode 100644
index 0000000..e7392b4
--- /dev/null
+++ b/arch/powerpc/mm/Makefile
@@ -0,0 +1,26 @@
+#
+# Makefile for the linux ppc-specific parts of the memory manager.
+#
+
+ifeq ($(CONFIG_PPC64),y)
+EXTRA_CFLAGS	+= -mno-minimal-toc
+endif
+
+obj-y				:= fault.o mem.o \
+				   init_$(CONFIG_WORD_SIZE).o \
+				   pgtable_$(CONFIG_WORD_SIZE).o \
+				   mmu_context_$(CONFIG_WORD_SIZE).o
+hash-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
+obj-$(CONFIG_PPC64)		+= hash_utils_64.o \
+				   slb_low.o slb.o stab.o \
+				   gup.o mmap.o $(hash-y)
+obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
+obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
+				   tlb_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_40x)		+= 40x_mmu.o
+obj-$(CONFIG_44x)		+= 44x_mmu.o
+obj-$(CONFIG_FSL_BOOKE)		+= fsl_booke_mmu.o
+obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
+obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
new file mode 100644
index 0000000..565b7a2
--- /dev/null
+++ b/arch/powerpc/mm/fault.c
@@ -0,0 +1,412 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Derived from "arch/i386/mm/fault.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Modified by Cort Dougan and Paul Mackerras.
+ *
+ *  Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+#include <asm/siginfo.h>
+
+
+#ifdef CONFIG_KPROBES
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+	int ret = 0;
+
+	/* kprobe_running() needs smp_processor_id() */
+	if (!user_mode(regs)) {
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, 11))
+			ret = 1;
+		preempt_enable();
+	}
+
+	return ret;
+}
+#else
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Check whether the instruction at regs->nip is a store using
+ * an update addressing form which will update r1.
+ */
+static int store_updates_sp(struct pt_regs *regs)
+{
+	unsigned int inst;
+
+	if (get_user(inst, (unsigned int __user *)regs->nip))
+		return 0;
+	/* check for 1 in the rA field */
+	if (((inst >> 16) & 0x1f) != 1)
+		return 0;
+	/* check major opcode */
+	switch (inst >> 26) {
+	case 37:	/* stwu */
+	case 39:	/* stbu */
+	case 45:	/* sthu */
+	case 53:	/* stfsu */
+	case 55:	/* stfdu */
+		return 1;
+	case 62:	/* std or stdu */
+		return (inst & 3) == 1;
+	case 31:
+		/* check minor opcode */
+		switch ((inst >> 1) & 0x3ff) {
+		case 181:	/* stdux */
+		case 183:	/* stwux */
+		case 247:	/* stbux */
+		case 439:	/* sthux */
+		case 695:	/* stfsux */
+		case 759:	/* stfdux */
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * For 600- and 800-family processors, the error_code parameter is DSISR
+ * for a data fault, SRR1 for an instruction fault. For 400-family processors
+ * the error_code parameter is ESR for a data fault, 0 for an instruction
+ * fault.
+ * For 64-bit processors, the error_code parameter is
+ *  - DSISR for a non-SLB data access fault,
+ *  - SRR1 & 0x08000000 for a non-SLB instruction access fault
+ *  - 0 any SLB fault.
+ *
+ * The return value is 0 if the fault was handled, or the signal
+ * number if this is a kernel fault that can't be handled here.
+ */
+int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+			    unsigned long error_code)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+	siginfo_t info;
+	int code = SEGV_MAPERR;
+	int is_write = 0, ret;
+	int trap = TRAP(regs);
+ 	int is_exec = trap == 0x400;
+
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+	/*
+	 * Fortunately the bit assignments in SRR1 for an instruction
+	 * fault and DSISR for a data fault are mostly the same for the
+	 * bits we are interested in.  But there are some bits which
+	 * indicate errors in DSISR but can validly be set in SRR1.
+	 */
+	if (trap == 0x400)
+		error_code &= 0x48200000;
+	else
+		is_write = error_code & DSISR_ISSTORE;
+#else
+	is_write = error_code & ESR_DST;
+#endif /* CONFIG_4xx || CONFIG_BOOKE */
+
+	if (notify_page_fault(regs))
+		return 0;
+
+	if (unlikely(debugger_fault_handler(regs)))
+		return 0;
+
+	/* On a kernel SLB miss we can only check for a valid exception entry */
+	if (!user_mode(regs) && (address >= TASK_SIZE))
+		return SIGSEGV;
+
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+  	if (error_code & DSISR_DABRMATCH) {
+		/* DABR match */
+		do_dabr(regs, address, error_code);
+		return 0;
+	}
+#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
+
+	if (in_atomic() || mm == NULL) {
+		if (!user_mode(regs))
+			return SIGSEGV;
+		/* in_atomic() in user mode is really bad,
+		   as is current->mm == NULL. */
+		printk(KERN_EMERG "Page fault in user mode with "
+		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
+		       regs->nip, regs->msr);
+		die("Weird page fault", regs, SIGSEGV);
+	}
+
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in the
+	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
+	 * erroneous fault occurring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space.  Luckily the kernel only validly references user
+	 * space from well defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibility of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source.  If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if (!user_mode(regs) && !search_exception_tables(regs->nip))
+			goto bad_area_nosemaphore;
+
+		down_read(&mm->mmap_sem);
+	}
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+
+	/*
+	 * N.B. The POWER/Open ABI allows programs to access up to
+	 * 288 bytes below the stack pointer.
+	 * The kernel signal delivery code writes up to about 1.5kB
+	 * below the stack pointer (r1) before decrementing it.
+	 * The exec code can write slightly over 640kB to the stack
+	 * before setting the user r1.  Thus we allow the stack to
+	 * expand to 1MB without further checks.
+	 */
+	if (address + 0x100000 < vma->vm_end) {
+		/* get user regs even if this fault is in kernel mode */
+		struct pt_regs *uregs = current->thread.regs;
+		if (uregs == NULL)
+			goto bad_area;
+
+		/*
+		 * A user-mode access to an address a long way below
+		 * the stack pointer is only valid if the instruction
+		 * is one which would update the stack pointer to the
+		 * address accessed if the instruction completed,
+		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+		 * (or the byte, halfword, float or double forms).
+		 *
+		 * If we don't check this then any write to the area
+		 * between the last mapped region and the stack will
+		 * expand the stack rather than segfaulting.
+		 */
+		if (address + 2048 < uregs->gpr[1]
+		    && (!user_mode(regs) || !store_updates_sp(regs)))
+			goto bad_area;
+	}
+	if (expand_stack(vma, address))
+		goto bad_area;
+
+good_area:
+	code = SEGV_ACCERR;
+#if defined(CONFIG_6xx)
+	if (error_code & 0x95700000)
+		/* an error such as lwarx to I/O controller space,
+		   address matching DABR, eciwx, etc. */
+		goto bad_area;
+#endif /* CONFIG_6xx */
+#if defined(CONFIG_8xx)
+        /* The MPC8xx seems to always set 0x80000000, which is
+         * "undefined".  Of those that can be set, this is the only
+         * one which seems bad.
+         */
+	if (error_code & 0x10000000)
+                /* Guarded storage error. */
+		goto bad_area;
+#endif /* CONFIG_8xx */
+
+	if (is_exec) {
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+		/* protection fault */
+		if (error_code & DSISR_PROTFAULT)
+			goto bad_area;
+		/*
+		 * Allow execution from readable areas if the MMU does not
+		 * provide separate controls over reading and executing.
+		 */
+		if (!(vma->vm_flags & VM_EXEC) &&
+		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
+			goto bad_area;
+#else
+		pte_t *ptep;
+		pmd_t *pmdp;
+
+		/* Since 4xx/Book-E supports per-page execute permission,
+		 * we lazily flush dcache to icache. */
+		ptep = NULL;
+		if (get_pteptr(mm, address, &ptep, &pmdp)) {
+			spinlock_t *ptl = pte_lockptr(mm, pmdp);
+			spin_lock(ptl);
+			if (pte_present(*ptep)) {
+				struct page *page = pte_page(*ptep);
+
+				if (!test_bit(PG_arch_1, &page->flags)) {
+					flush_dcache_icache_page(page);
+					set_bit(PG_arch_1, &page->flags);
+				}
+				pte_update(ptep, 0, _PAGE_HWEXEC |
+					   _PAGE_ACCESSED);
+				_tlbie(address, mm->context.id);
+				pte_unmap_unlock(ptep, ptl);
+				up_read(&mm->mmap_sem);
+				return 0;
+			}
+			pte_unmap_unlock(ptep, ptl);
+		}
+#endif
+	/* a write */
+	} else if (is_write) {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+	/* a read */
+	} else {
+		/* protection fault */
+		if (error_code & 0x08000000)
+			goto bad_area;
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			goto bad_area;
+	}
+
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.
+	 */
+ survive:
+	ret = handle_mm_fault(mm, vma, address, is_write);
+	if (unlikely(ret & VM_FAULT_ERROR)) {
+		if (ret & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (ret & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
+	}
+	if (ret & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
+	up_read(&mm->mmap_sem);
+	return 0;
+
+bad_area:
+	up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+	/* User mode accesses cause a SIGSEGV */
+	if (user_mode(regs)) {
+		_exception(SIGSEGV, regs, code, address);
+		return 0;
+	}
+
+	if (is_exec && (error_code & DSISR_PROTFAULT)
+	    && printk_ratelimit())
+		printk(KERN_CRIT "kernel tried to execute NX-protected"
+		       " page (%lx) - exploit attempt? (uid: %d)\n",
+		       address, current->uid);
+
+	return SIGSEGV;
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+	up_read(&mm->mmap_sem);
+	if (is_global_init(current)) {
+		yield();
+		down_read(&mm->mmap_sem);
+		goto survive;
+	}
+	printk("VM: killing process %s\n", current->comm);
+	if (user_mode(regs))
+		do_group_exit(SIGKILL);
+	return SIGKILL;
+
+do_sigbus:
+	up_read(&mm->mmap_sem);
+	if (user_mode(regs)) {
+		info.si_signo = SIGBUS;
+		info.si_errno = 0;
+		info.si_code = BUS_ADRERR;
+		info.si_addr = (void __user *)address;
+		force_sig_info(SIGBUS, &info, current);
+		return 0;
+	}
+	return SIGBUS;
+}
+
+/*
+ * bad_page_fault is called when we have a bad access from the kernel.
+ * It is called from the DSI and ISI handlers in head.S and from some
+ * of the procedures in traps.c.
+ */
+void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+{
+	const struct exception_table_entry *entry;
+
+	/* Are we prepared to handle this fault?  */
+	if ((entry = search_exception_tables(regs->nip)) != NULL) {
+		regs->nip = entry->fixup;
+		return;
+	}
+
+	/* kernel has accessed a bad area */
+
+	switch (regs->trap) {
+	case 0x300:
+	case 0x380:
+		printk(KERN_ALERT "Unable to handle kernel paging request for "
+			"data at address 0x%08lx\n", regs->dar);
+		break;
+	case 0x400:
+	case 0x480:
+		printk(KERN_ALERT "Unable to handle kernel paging request for "
+			"instruction fetch\n");
+		break;
+	default:
+		printk(KERN_ALERT "Unable to handle kernel paging request for "
+			"unknown fault\n");
+		break;
+	}
+	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
+		regs->nip);
+
+	die("Kernel access of bad area", regs, sig);
+}
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
new file mode 100644
index 0000000..6046e51
--- /dev/null
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -0,0 +1,232 @@
+/*
+ * Modifications by Kumar Gala (galak@kernel.crashing.org) to support
+ * E500 Book E processors.
+ *
+ * Copyright 2004 Freescale Semiconductor, Inc
+ *
+ * This file contains the routines for initializing the MMU
+ * on the 4xx series of chips.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/highmem.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/setup.h>
+
+#include "mmu_decl.h"
+
+extern void loadcam_entry(unsigned int index);
+unsigned int tlbcam_index;
+unsigned int num_tlbcam_entries;
+static unsigned long __cam0, __cam1, __cam2;
+
+#define NUM_TLBCAMS	(16)
+
+struct tlbcam {
+   	u32	MAS0;
+	u32	MAS1;
+	u32	MAS2;
+	u32	MAS3;
+	u32	MAS7;
+} TLBCAM[NUM_TLBCAMS];
+
+struct tlbcamrange {
+   	unsigned long start;
+	unsigned long limit;
+	phys_addr_t phys;
+} tlbcam_addrs[NUM_TLBCAMS];
+
+extern unsigned int tlbcam_index;
+
+/*
+ * Return PA for this VA if it is mapped by a CAM, or 0
+ */
+phys_addr_t v_mapped_by_tlbcam(unsigned long va)
+{
+	int b;
+	for (b = 0; b < tlbcam_index; ++b)
+		if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit)
+			return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start);
+	return 0;
+}
+
+/*
+ * Return VA for a given PA or 0 if not mapped
+ */
+unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
+{
+	int b;
+	for (b = 0; b < tlbcam_index; ++b)
+		if (pa >= tlbcam_addrs[b].phys
+	    	    && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start)
+		              +tlbcam_addrs[b].phys)
+			return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys);
+	return 0;
+}
+
+/*
+ * Set up one of the I/D BAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 4 between 4k and 256M.
+ */
+void settlbcam(int index, unsigned long virt, phys_addr_t phys,
+		unsigned int size, int flags, unsigned int pid)
+{
+	unsigned int tsize, lz;
+
+	asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size));
+	tsize = (21 - lz) / 2;
+
+#ifdef CONFIG_SMP
+	if ((flags & _PAGE_NO_CACHE) == 0)
+		flags |= _PAGE_COHERENT;
+#endif
+
+	TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1);
+	TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid);
+	TLBCAM[index].MAS2 = virt & PAGE_MASK;
+
+	TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0;
+	TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0;
+	TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0;
+	TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
+	TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
+
+	TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR;
+	TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+
+#ifndef CONFIG_KGDB /* want user access for breakpoints */
+	if (flags & _PAGE_USER) {
+	   TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
+	   TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+	}
+#else
+	TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
+	TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+#endif
+
+	tlbcam_addrs[index].start = virt;
+	tlbcam_addrs[index].limit = virt + size - 1;
+	tlbcam_addrs[index].phys = phys;
+
+	loadcam_entry(index);
+}
+
+void invalidate_tlbcam_entry(int index)
+{
+	TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index);
+	TLBCAM[index].MAS1 = ~MAS1_VALID;
+
+	loadcam_entry(index);
+}
+
+void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1,
+		unsigned long cam2)
+{
+	settlbcam(0, PAGE_OFFSET, memstart_addr, cam0, _PAGE_KERNEL, 0);
+	tlbcam_index++;
+	if (cam1) {
+		tlbcam_index++;
+		settlbcam(1, PAGE_OFFSET+cam0, memstart_addr+cam0, cam1, _PAGE_KERNEL, 0);
+	}
+	if (cam2) {
+		tlbcam_index++;
+		settlbcam(2, PAGE_OFFSET+cam0+cam1, memstart_addr+cam0+cam1, cam2, _PAGE_KERNEL, 0);
+	}
+}
+
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+	flush_instruction_cache();
+}
+
+unsigned long __init mmu_mapin_ram(void)
+{
+	cam_mapin_ram(__cam0, __cam1, __cam2);
+
+	return __cam0 + __cam1 + __cam2;
+}
+
+
+void __init
+adjust_total_lowmem(void)
+{
+	phys_addr_t max_lowmem_size = __max_low_memory;
+	phys_addr_t cam_max_size = 0x10000000;
+	phys_addr_t ram;
+
+	/* adjust CAM size to max_lowmem_size */
+	if (max_lowmem_size < cam_max_size)
+		cam_max_size = max_lowmem_size;
+
+	/* adjust lowmem size to max_lowmem_size */
+	ram = min(max_lowmem_size, total_lowmem);
+
+	/* Calculate CAM values */
+	__cam0 = 1UL << 2 * (__ilog2(ram) / 2);
+	if (__cam0 > cam_max_size)
+		__cam0 = cam_max_size;
+	ram -= __cam0;
+	if (ram) {
+		__cam1 = 1UL << 2 * (__ilog2(ram) / 2);
+		if (__cam1 > cam_max_size)
+			__cam1 = cam_max_size;
+		ram -= __cam1;
+	}
+	if (ram) {
+		__cam2 = 1UL << 2 * (__ilog2(ram) / 2);
+		if (__cam2 > cam_max_size)
+			__cam2 = cam_max_size;
+		ram -= __cam2;
+	}
+
+	printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb,"
+			" CAM2=%ldMb residual: %ldMb\n",
+			__cam0 >> 20, __cam1 >> 20, __cam2 >> 20,
+			(long int)((total_lowmem - __cam0 - __cam1 - __cam2)
+				   >> 20));
+	__max_low_memory = __cam0 + __cam1 + __cam2;
+	__initial_memory_limit_addr = memstart_addr + __max_low_memory;
+}
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
new file mode 100644
index 0000000..28a114d
--- /dev/null
+++ b/arch/powerpc/mm/gup.c
@@ -0,0 +1,281 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#undef DEBUG
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask, result;
+	pte_t *ptep;
+
+	result = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		result |= _PAGE_RW;
+	mask = result | _PAGE_SPECIAL;
+
+	ptep = pte_offset_kernel(&pmd, addr);
+	do {
+		pte_t pte = *ptep;
+		struct page *page;
+
+		if ((pte_val(pte) & mask) != result)
+			return 0;
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		if (!page_cache_get_speculative(page))
+			return 0;
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			return 0;
+		}
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
+				 unsigned long *addr, unsigned long end,
+				 int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	unsigned long pte_end;
+	struct page *head, *page;
+	pte_t pte;
+	int refs;
+
+	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = *ptep;
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (*addr += PAGE_SIZE, *addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		/* Could be optimized better */
+		while (*nr) {
+			put_page(page);
+			(*nr)--;
+		}
+	}
+
+	return 1;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int psize, nr = 0;
+	unsigned int shift;
+
+	pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		goto slow_irqon;
+
+	pr_debug("  aligned: %lx .. %lx\n", start, end);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	/* We bail out on slice boundary crossing when hugetlb is
+	 * enabled in order to not have to deal with two different
+	 * page table formats
+	 */
+	if (addr < SLICE_LOW_TOP) {
+		if (end > SLICE_LOW_TOP)
+			goto slow_irqon;
+
+		if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
+			     GET_LOW_SLICE_INDEX(end - 1)))
+			goto slow_irqon;
+	} else {
+		if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
+			     GET_HIGH_SLICE_INDEX(end - 1)))
+			goto slow_irqon;
+	}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on powerpc.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+
+	psize = get_slice_psize(mm, addr);
+	shift = mmu_psize_defs[psize].shift;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (unlikely(mmu_huge_psizes[psize])) {
+		pte_t *ptep;
+		unsigned long a = addr;
+		unsigned long sz = ((1UL) << shift);
+		struct hstate *hstate = size_to_hstate(sz);
+
+		BUG_ON(!hstate);
+		/*
+		 * XXX: could be optimized to avoid hstate
+		 * lookup entirely (just use shift)
+		 */
+
+		do {
+			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
+			ptep = huge_pte_offset(mm, a);
+			pr_debug(" %016lx: huge ptep %p\n", a, ptep);
+			if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
+						   &nr))
+				goto slow;
+		} while (a != end);
+	} else
+#endif /* CONFIG_HUGETLB_PAGE */
+	{
+		pgdp = pgd_offset(mm, addr);
+		do {
+			pgd_t pgd = *pgdp;
+
+			VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
+			pr_debug("  %016lx: normal pgd %p\n", addr,
+				 (void *)pgd_val(pgd));
+			next = pgd_addr_end(addr, end);
+			if (pgd_none(pgd))
+				goto slow;
+			if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+				goto slow;
+		} while (pgdp++, addr = next, addr != end);
+	}
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		pr_debug("  slow path ! nr = %d\n", nr);
+
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
new file mode 100644
index 0000000..7bffb70
--- /dev/null
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -0,0 +1,665 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
+ *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
+ *  Adapted for Power Macintosh by Paul Mackerras.
+ *  Low-level exception handlers and MMU support
+ *  rewritten by Paul Mackerras.
+ *    Copyright (C) 1996 Paul Mackerras.
+ *
+ *  This file contains low-level assembler routines for managing
+ *  the PowerPC MMU hash table.  (PPC 8xx processors don't use a
+ *  hash table, so this file is not used on them.)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_SMP
+	.section .bss
+	.align	2
+	.globl mmu_hash_lock
+mmu_hash_lock:
+	.space	4
+#endif /* CONFIG_SMP */
+
+/*
+ * Sync CPUs with hash_page taking & releasing the hash
+ * table lock
+ */
+#ifdef CONFIG_SMP
+	.text
+_GLOBAL(hash_page_sync)
+	mfmsr   r10
+	rlwinm  r0,r10,0,17,15          /* clear bit 16 (MSR_EE) */
+	mtmsr   r0
+	lis	r8,mmu_hash_lock@h
+	ori	r8,r8,mmu_hash_lock@l
+	lis	r0,0x0fff
+	b	10f
+11:	lwz	r6,0(r8)
+	cmpwi	0,r6,0
+	bne	11b
+10:	lwarx	r6,0,r8
+	cmpwi	0,r6,0
+	bne-	11b
+	stwcx.	r0,0,r8
+	bne-	10b
+	isync
+	eieio
+	li	r0,0
+	stw	r0,0(r8)
+	mtmsr	r10
+	blr
+#endif /* CONFIG_SMP */
+
+/*
+ * Load a PTE into the hash table, if possible.
+ * The address is in r4, and r3 contains an access flag:
+ * _PAGE_RW (0x400) if a write.
+ * r9 contains the SRR1 value, from which we use the MSR_PR bit.
+ * SPRG3 contains the physical address of the current task's thread.
+ *
+ * Returns to the caller if the access is illegal or there is no
+ * mapping for the address.  Otherwise it places an appropriate PTE
+ * in the hash table and returns from the exception.
+ * Uses r0, r3 - r8, r10, ctr, lr.
+ */
+	.text
+_GLOBAL(hash_page)
+	tophys(r7,0)			/* gets -KERNELBASE into r7 */
+#ifdef CONFIG_SMP
+	addis	r8,r7,mmu_hash_lock@h
+	ori	r8,r8,mmu_hash_lock@l
+	lis	r0,0x0fff
+	b	10f
+11:	lwz	r6,0(r8)
+	cmpwi	0,r6,0
+	bne	11b
+10:	lwarx	r6,0,r8
+	cmpwi	0,r6,0
+	bne-	11b
+	stwcx.	r0,0,r8
+	bne-	10b
+	isync
+#endif
+	/* Get PTE (linux-style) and check access */
+	lis	r0,KERNELBASE@h		/* check if kernel address */
+	cmplw	0,r4,r0
+	mfspr	r8,SPRN_SPRG3		/* current task's THREAD (phys) */
+	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
+	lwz	r5,PGDIR(r8)		/* virt page-table root */
+	blt+	112f			/* assume user more likely */
+	lis	r5,swapper_pg_dir@ha	/* if kernel address, use */
+	addi	r5,r5,swapper_pg_dir@l	/* kernel page table */
+	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
+112:	add	r5,r5,r7		/* convert to phys addr */
+#ifndef CONFIG_PTE_64BIT
+	rlwimi	r5,r4,12,20,29		/* insert top 10 bits of address */
+	lwz	r8,0(r5)		/* get pmd entry */
+	rlwinm.	r8,r8,0,0,19		/* extract address of pte page */
+#else
+	rlwinm	r8,r4,13,19,29		/* Compute pgdir/pmd offset */
+	lwzx	r8,r8,r5		/* Get L1 entry */
+	rlwinm.	r8,r8,0,0,20		/* extract pt base address */
+#endif
+#ifdef CONFIG_SMP
+	beq-	hash_page_out		/* return if no mapping */
+#else
+	/* XXX it seems like the 601 will give a machine fault on the
+	   rfi if its alignment is wrong (bottom 4 bits of address are
+	   8 or 0xc) and we have had a not-taken conditional branch
+	   to the address following the rfi. */
+	beqlr-
+#endif
+#ifndef CONFIG_PTE_64BIT
+	rlwimi	r8,r4,22,20,29		/* insert next 10 bits of address */
+#else
+	rlwimi	r8,r4,23,20,28		/* compute pte address */
+#endif
+	rlwinm	r0,r3,32-3,24,24	/* _PAGE_RW access -> _PAGE_DIRTY */
+	ori	r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
+
+	/*
+	 * Update the linux PTE atomically.  We do the lwarx up-front
+	 * because almost always, there won't be a permission violation
+	 * and there won't already be an HPTE, and thus we will have
+	 * to update the PTE to set _PAGE_HASHPTE.  -- paulus.
+	 *
+	 * If PTE_64BIT is set, the low word is the flags word; use that
+	 * word for locking since it contains all the interesting bits.
+	 */
+#if (PTE_FLAGS_OFFSET != 0)
+	addi	r8,r8,PTE_FLAGS_OFFSET
+#endif
+retry:
+	lwarx	r6,0,r8			/* get linux-style pte, flag word */
+	andc.	r5,r3,r6		/* check access & ~permission */
+#ifdef CONFIG_SMP
+	bne-	hash_page_out		/* return if access not permitted */
+#else
+	bnelr-
+#endif
+	or	r5,r0,r6		/* set accessed/dirty bits */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+	subf	r10,r6,r8		/* create false data dependency */
+	subi	r10,r10,PTE_FLAGS_OFFSET
+	lwzx	r10,r6,r10		/* Get upper PTE word */
+#else
+	lwz	r10,-PTE_FLAGS_OFFSET(r8)
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_PTE_64BIT */
+	stwcx.	r5,0,r8			/* attempt to update PTE */
+	bne-	retry			/* retry if someone got there first */
+
+	mfsrin	r3,r4			/* get segment reg for segment */
+	mfctr	r0
+	stw	r0,_CTR(r11)
+	bl	create_hpte		/* add the hash table entry */
+
+#ifdef CONFIG_SMP
+	eieio
+	addis	r8,r7,mmu_hash_lock@ha
+	li	r0,0
+	stw	r0,mmu_hash_lock@l(r8)
+#endif
+
+	/* Return from the exception */
+	lwz	r5,_CTR(r11)
+	mtctr	r5
+	lwz	r0,GPR0(r11)
+	lwz	r7,GPR7(r11)
+	lwz	r8,GPR8(r11)
+	b	fast_exception_return
+
+#ifdef CONFIG_SMP
+hash_page_out:
+	eieio
+	addis	r8,r7,mmu_hash_lock@ha
+	li	r0,0
+	stw	r0,mmu_hash_lock@l(r8)
+	blr
+#endif /* CONFIG_SMP */
+
+/*
+ * Add an entry for a particular page to the hash table.
+ *
+ * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval)
+ *
+ * We assume any necessary modifications to the pte (e.g. setting
+ * the accessed bit) have already been done and that there is actually
+ * a hash table in use (i.e. we're not on a 603).
+ */
+_GLOBAL(add_hash_page)
+	mflr	r0
+	stw	r0,4(r1)
+
+	/* Convert context and va to VSID */
+	mulli	r3,r3,897*16		/* multiply context by context skew */
+	rlwinm	r0,r4,4,28,31		/* get ESID (top 4 bits of va) */
+	mulli	r0,r0,0x111		/* multiply by ESID skew */
+	add	r3,r3,r0		/* note create_hpte trims to 24 bits */
+
+#ifdef CONFIG_SMP
+	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT) /* use cpu number to make tag */
+	lwz	r8,TI_CPU(r8)		/* to go in mmu_hash_lock */
+	oris	r8,r8,12
+#endif /* CONFIG_SMP */
+
+	/*
+	 * We disable interrupts here, even on UP, because we don't
+	 * want to race with hash_page, and because we want the
+	 * _PAGE_HASHPTE bit to be a reliable indication of whether
+	 * the HPTE exists (or at least whether one did once).
+	 * We also turn off the MMU for data accesses so that we
+	 * we can't take a hash table miss (assuming the code is
+	 * covered by a BAT).  -- paulus
+	 */
+	mfmsr	r9
+	SYNC
+	rlwinm	r0,r9,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear MSR_DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+
+	tophys(r7,0)
+
+#ifdef CONFIG_SMP
+	addis	r6,r7,mmu_hash_lock@ha
+	addi	r6,r6,mmu_hash_lock@l
+10:	lwarx	r0,0,r6			/* take the mmu_hash_lock */
+	cmpi	0,r0,0
+	bne-	11f
+	stwcx.	r8,0,r6
+	beq+	12f
+11:	lwz	r0,0(r6)
+	cmpi	0,r0,0
+	beq	10b
+	b	11b
+12:	isync
+#endif
+
+	/*
+	 * Fetch the linux pte and test and set _PAGE_HASHPTE atomically.
+	 * If _PAGE_HASHPTE was already set, we don't replace the existing
+	 * HPTE, so we just unlock and return.
+	 */
+	mr	r8,r5
+#ifndef CONFIG_PTE_64BIT
+	rlwimi	r8,r4,22,20,29
+#else
+	rlwimi	r8,r4,23,20,28
+	addi	r8,r8,PTE_FLAGS_OFFSET
+#endif
+1:	lwarx	r6,0,r8
+	andi.	r0,r6,_PAGE_HASHPTE
+	bne	9f			/* if HASHPTE already set, done */
+#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_SMP
+	subf	r10,r6,r8		/* create false data dependency */
+	subi	r10,r10,PTE_FLAGS_OFFSET
+	lwzx	r10,r6,r10		/* Get upper PTE word */
+#else
+	lwz	r10,-PTE_FLAGS_OFFSET(r8)
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_PTE_64BIT */
+	ori	r5,r6,_PAGE_HASHPTE
+	stwcx.	r5,0,r8
+	bne-	1b
+
+	bl	create_hpte
+
+9:
+#ifdef CONFIG_SMP
+	addis	r6,r7,mmu_hash_lock@ha
+	addi	r6,r6,mmu_hash_lock@l
+	eieio
+	li	r0,0
+	stw	r0,0(r6)		/* clear mmu_hash_lock */
+#endif
+
+	/* reenable interrupts and DR */
+	mtmsr	r9
+	SYNC_601
+	isync
+
+	lwz	r0,4(r1)
+	mtlr	r0
+	blr
+
+/*
+ * This routine adds a hardware PTE to the hash table.
+ * It is designed to be called with the MMU either on or off.
+ * r3 contains the VSID, r4 contains the virtual address,
+ * r5 contains the linux PTE, r6 contains the old value of the
+ * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
+ * offset to be added to addresses (0 if the MMU is on,
+ * -KERNELBASE if it is off).  r10 contains the upper half of
+ * the PTE if CONFIG_PTE_64BIT.
+ * On SMP, the caller should have the mmu_hash_lock held.
+ * We assume that the caller has (or will) set the _PAGE_HASHPTE
+ * bit in the linux PTE in memory.  The value passed in r6 should
+ * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set
+ * this routine will skip the search for an existing HPTE.
+ * This procedure modifies r0, r3 - r6, r8, cr0.
+ *  -- paulus.
+ *
+ * For speed, 4 of the instructions get patched once the size and
+ * physical address of the hash table are known.  These definitions
+ * of Hash_base and Hash_bits below are just an example.
+ */
+Hash_base = 0xc0180000
+Hash_bits = 12				/* e.g. 256kB hash table */
+Hash_msk = (((1 << Hash_bits) - 1) * 64)
+
+/* defines for the PTE format for 32-bit PPCs */
+#define HPTE_SIZE	8
+#define PTEG_SIZE	64
+#define LG_PTEG_SIZE	6
+#define LDPTEu		lwzu
+#define LDPTE		lwz
+#define STPTE		stw
+#define CMPPTE		cmpw
+#define PTE_H		0x40
+#define PTE_V		0x80000000
+#define TST_V(r)	rlwinm. r,r,0,0,0
+#define SET_V(r)	oris r,r,PTE_V@h
+#define CLR_V(r,t)	rlwinm r,r,0,1,31
+
+#define HASH_LEFT	31-(LG_PTEG_SIZE+Hash_bits-1)
+#define HASH_RIGHT	31-LG_PTEG_SIZE
+
+_GLOBAL(create_hpte)
+	/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
+	rlwinm	r8,r5,32-10,31,31	/* _PAGE_RW -> PP lsb */
+	rlwinm	r0,r5,32-7,31,31	/* _PAGE_DIRTY -> PP lsb */
+	and	r8,r8,r0		/* writable if _RW & _DIRTY */
+	rlwimi	r5,r5,32-1,30,30	/* _PAGE_USER -> PP msb */
+	rlwimi	r5,r5,32-2,31,31	/* _PAGE_USER -> PP lsb */
+	ori	r8,r8,0xe14		/* clear out reserved bits and M */
+	andc	r8,r5,r8		/* PP = user? (rw&dirty? 2: 3): 0 */
+BEGIN_FTR_SECTION
+	ori	r8,r8,_PAGE_COHERENT	/* set M (coherence required) */
+END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT)
+#ifdef CONFIG_PTE_64BIT
+	/* Put the XPN bits into the PTE */
+	rlwimi	r8,r10,8,20,22
+	rlwimi	r8,r10,2,29,29
+#endif
+
+	/* Construct the high word of the PPC-style PTE (r5) */
+	rlwinm	r5,r3,7,1,24		/* put VSID in 0x7fffff80 bits */
+	rlwimi	r5,r4,10,26,31		/* put in API (abbrev page index) */
+	SET_V(r5)			/* set V (valid) bit */
+
+	/* Get the address of the primary PTE group in the hash table (r3) */
+_GLOBAL(hash_page_patch_A)
+	addis	r0,r7,Hash_base@h	/* base address of hash table */
+	rlwimi	r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT    /* VSID -> hash */
+	rlwinm	r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+	xor	r3,r3,r0		/* make primary hash */
+	li	r0,8			/* PTEs/group */
+
+	/*
+	 * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search
+	 * if it is clear, meaning that the HPTE isn't there already...
+	 */
+	andi.	r6,r6,_PAGE_HASHPTE
+	beq+	10f			/* no PTE: go look for an empty slot */
+	tlbie	r4
+
+	addis	r4,r7,htab_hash_searches@ha
+	lwz	r6,htab_hash_searches@l(r4)
+	addi	r6,r6,1			/* count how many searches we do */
+	stw	r6,htab_hash_searches@l(r4)
+
+	/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
+	mtctr	r0
+	addi	r4,r3,-HPTE_SIZE
+1:	LDPTEu	r6,HPTE_SIZE(r4)	/* get next PTE */
+	CMPPTE	0,r6,r5
+	bdnzf	2,1b			/* loop while ctr != 0 && !cr0.eq */
+	beq+	found_slot
+
+	/* Search the secondary PTEG for a matching PTE */
+	ori	r5,r5,PTE_H		/* set H (secondary hash) bit */
+_GLOBAL(hash_page_patch_B)
+	xoris	r4,r3,Hash_msk>>16	/* compute secondary hash */
+	xori	r4,r4,(-PTEG_SIZE & 0xffff)
+	addi	r4,r4,-HPTE_SIZE
+	mtctr	r0
+2:	LDPTEu	r6,HPTE_SIZE(r4)
+	CMPPTE	0,r6,r5
+	bdnzf	2,2b
+	beq+	found_slot
+	xori	r5,r5,PTE_H		/* clear H bit again */
+
+	/* Search the primary PTEG for an empty slot */
+10:	mtctr	r0
+	addi	r4,r3,-HPTE_SIZE	/* search primary PTEG */
+1:	LDPTEu	r6,HPTE_SIZE(r4)	/* get next PTE */
+	TST_V(r6)			/* test valid bit */
+	bdnzf	2,1b			/* loop while ctr != 0 && !cr0.eq */
+	beq+	found_empty
+
+	/* update counter of times that the primary PTEG is full */
+	addis	r4,r7,primary_pteg_full@ha
+	lwz	r6,primary_pteg_full@l(r4)
+	addi	r6,r6,1
+	stw	r6,primary_pteg_full@l(r4)
+
+	/* Search the secondary PTEG for an empty slot */
+	ori	r5,r5,PTE_H		/* set H (secondary hash) bit */
+_GLOBAL(hash_page_patch_C)
+	xoris	r4,r3,Hash_msk>>16	/* compute secondary hash */
+	xori	r4,r4,(-PTEG_SIZE & 0xffff)
+	addi	r4,r4,-HPTE_SIZE
+	mtctr	r0
+2:	LDPTEu	r6,HPTE_SIZE(r4)
+	TST_V(r6)
+	bdnzf	2,2b
+	beq+	found_empty
+	xori	r5,r5,PTE_H		/* clear H bit again */
+
+	/*
+	 * Choose an arbitrary slot in the primary PTEG to overwrite.
+	 * Since both the primary and secondary PTEGs are full, and we
+	 * have no information that the PTEs in the primary PTEG are
+	 * more important or useful than those in the secondary PTEG,
+	 * and we know there is a definite (although small) speed
+	 * advantage to putting the PTE in the primary PTEG, we always
+	 * put the PTE in the primary PTEG.
+	 *
+	 * In addition, we skip any slot that is mapping kernel text in
+	 * order to avoid a deadlock when not using BAT mappings if
+	 * trying to hash in the kernel hash code itself after it has
+	 * already taken the hash table lock. This works in conjunction
+	 * with pre-faulting of the kernel text.
+	 *
+	 * If the hash table bucket is full of kernel text entries, we'll
+	 * lockup here but that shouldn't happen
+	 */
+
+1:	addis	r4,r7,next_slot@ha		/* get next evict slot */
+	lwz	r6,next_slot@l(r4)
+	addi	r6,r6,HPTE_SIZE			/* search for candidate */
+	andi.	r6,r6,7*HPTE_SIZE
+	stw	r6,next_slot@l(r4)
+	add	r4,r3,r6
+	LDPTE	r0,HPTE_SIZE/2(r4)		/* get PTE second word */
+	clrrwi	r0,r0,12
+	lis	r6,etext@h
+	ori	r6,r6,etext@l			/* get etext */
+	tophys(r6,r6)
+	cmpl	cr0,r0,r6			/* compare and try again */
+	blt	1b
+
+#ifndef CONFIG_SMP
+	/* Store PTE in PTEG */
+found_empty:
+	STPTE	r5,0(r4)
+found_slot:
+	STPTE	r8,HPTE_SIZE/2(r4)
+
+#else /* CONFIG_SMP */
+/*
+ * Between the tlbie above and updating the hash table entry below,
+ * another CPU could read the hash table entry and put it in its TLB.
+ * There are 3 cases:
+ * 1. using an empty slot
+ * 2. updating an earlier entry to change permissions (i.e. enable write)
+ * 3. taking over the PTE for an unrelated address
+ *
+ * In each case it doesn't really matter if the other CPUs have the old
+ * PTE in their TLB.  So we don't need to bother with another tlbie here,
+ * which is convenient as we've overwritten the register that had the
+ * address. :-)  The tlbie above is mainly to make sure that this CPU comes
+ * and gets the new PTE from the hash table.
+ *
+ * We do however have to make sure that the PTE is never in an invalid
+ * state with the V bit set.
+ */
+found_empty:
+found_slot:
+	CLR_V(r5,r0)		/* clear V (valid) bit in PTE */
+	STPTE	r5,0(r4)
+	sync
+	TLBSYNC
+	STPTE	r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */
+	sync
+	SET_V(r5)
+	STPTE	r5,0(r4)	/* finally set V bit in PTE */
+#endif /* CONFIG_SMP */
+
+	sync		/* make sure pte updates get to memory */
+	blr
+
+	.section .bss
+	.align	2
+next_slot:
+	.space	4
+primary_pteg_full:
+	.space	4
+htab_hash_searches:
+	.space	4
+	.previous
+
+/*
+ * Flush the entry for a particular page from the hash table.
+ *
+ * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval,
+ *		    int count)
+ *
+ * We assume that there is a hash table in use (Hash != 0).
+ */
+_GLOBAL(flush_hash_pages)
+	tophys(r7,0)
+
+	/*
+	 * We disable interrupts here, even on UP, because we want
+	 * the _PAGE_HASHPTE bit to be a reliable indication of
+	 * whether the HPTE exists (or at least whether one did once).
+	 * We also turn off the MMU for data accesses so that we
+	 * we can't take a hash table miss (assuming the code is
+	 * covered by a BAT).  -- paulus
+	 */
+	mfmsr	r10
+	SYNC
+	rlwinm	r0,r10,0,17,15		/* clear bit 16 (MSR_EE) */
+	rlwinm	r0,r0,0,28,26		/* clear MSR_DR */
+	mtmsr	r0
+	SYNC_601
+	isync
+
+	/* First find a PTE in the range that has _PAGE_HASHPTE set */
+#ifndef CONFIG_PTE_64BIT
+	rlwimi	r5,r4,22,20,29
+#else
+	rlwimi	r5,r4,23,20,28
+#endif
+1:	lwz	r0,PTE_FLAGS_OFFSET(r5)
+	cmpwi	cr1,r6,1
+	andi.	r0,r0,_PAGE_HASHPTE
+	bne	2f
+	ble	cr1,19f
+	addi	r4,r4,0x1000
+	addi	r5,r5,PTE_SIZE
+	addi	r6,r6,-1
+	b	1b
+
+	/* Convert context and va to VSID */
+2:	mulli	r3,r3,897*16		/* multiply context by context skew */
+	rlwinm	r0,r4,4,28,31		/* get ESID (top 4 bits of va) */
+	mulli	r0,r0,0x111		/* multiply by ESID skew */
+	add	r3,r3,r0		/* note code below trims to 24 bits */
+
+	/* Construct the high word of the PPC-style PTE (r11) */
+	rlwinm	r11,r3,7,1,24		/* put VSID in 0x7fffff80 bits */
+	rlwimi	r11,r4,10,26,31		/* put in API (abbrev page index) */
+	SET_V(r11)			/* set V (valid) bit */
+
+#ifdef CONFIG_SMP
+	addis	r9,r7,mmu_hash_lock@ha
+	addi	r9,r9,mmu_hash_lock@l
+	rlwinm	r8,r1,0,0,(31-THREAD_SHIFT)
+	add	r8,r8,r7
+	lwz	r8,TI_CPU(r8)
+	oris	r8,r8,9
+10:	lwarx	r0,0,r9
+	cmpi	0,r0,0
+	bne-	11f
+	stwcx.	r8,0,r9
+	beq+	12f
+11:	lwz	r0,0(r9)
+	cmpi	0,r0,0
+	beq	10b
+	b	11b
+12:	isync
+#endif
+
+	/*
+	 * Check the _PAGE_HASHPTE bit in the linux PTE.  If it is
+	 * already clear, we're done (for this pte).  If not,
+	 * clear it (atomically) and proceed.  -- paulus.
+	 */
+#if (PTE_FLAGS_OFFSET != 0)
+	addi	r5,r5,PTE_FLAGS_OFFSET
+#endif
+33:	lwarx	r8,0,r5			/* fetch the pte flags word */
+	andi.	r0,r8,_PAGE_HASHPTE
+	beq	8f			/* done if HASHPTE is already clear */
+	rlwinm	r8,r8,0,31,29		/* clear HASHPTE bit */
+	stwcx.	r8,0,r5			/* update the pte */
+	bne-	33b
+
+	/* Get the address of the primary PTE group in the hash table (r3) */
+_GLOBAL(flush_hash_patch_A)
+	addis	r8,r7,Hash_base@h	/* base address of hash table */
+	rlwimi	r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT    /* VSID -> hash */
+	rlwinm	r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+	xor	r8,r0,r8		/* make primary hash */
+
+	/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
+	li	r0,8			/* PTEs/group */
+	mtctr	r0
+	addi	r12,r8,-HPTE_SIZE
+1:	LDPTEu	r0,HPTE_SIZE(r12)	/* get next PTE */
+	CMPPTE	0,r0,r11
+	bdnzf	2,1b			/* loop while ctr != 0 && !cr0.eq */
+	beq+	3f
+
+	/* Search the secondary PTEG for a matching PTE */
+	ori	r11,r11,PTE_H		/* set H (secondary hash) bit */
+	li	r0,8			/* PTEs/group */
+_GLOBAL(flush_hash_patch_B)
+	xoris	r12,r8,Hash_msk>>16	/* compute secondary hash */
+	xori	r12,r12,(-PTEG_SIZE & 0xffff)
+	addi	r12,r12,-HPTE_SIZE
+	mtctr	r0
+2:	LDPTEu	r0,HPTE_SIZE(r12)
+	CMPPTE	0,r0,r11
+	bdnzf	2,2b
+	xori	r11,r11,PTE_H		/* clear H again */
+	bne-	4f			/* should rarely fail to find it */
+
+3:	li	r0,0
+	STPTE	r0,0(r12)		/* invalidate entry */
+4:	sync
+	tlbie	r4			/* in hw tlb too */
+	sync
+
+8:	ble	cr1,9f			/* if all ptes checked */
+81:	addi	r6,r6,-1
+	addi	r5,r5,PTE_SIZE
+	addi	r4,r4,0x1000
+	lwz	r0,0(r5)		/* check next pte */
+	cmpwi	cr1,r6,1
+	andi.	r0,r0,_PAGE_HASHPTE
+	bne	33b
+	bgt	cr1,81b
+
+9:
+#ifdef CONFIG_SMP
+	TLBSYNC
+	li	r0,0
+	stw	r0,0(r9)		/* clear mmu_hash_lock */
+#endif
+
+19:	mtmsr	r10
+	SYNC_601
+	isync
+	blr
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
new file mode 100644
index 0000000..a719f53
--- /dev/null
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -0,0 +1,958 @@
+/*
+ * ppc64 MMU hashtable management routines
+ *
+ * (c) Copyright IBM Corp. 2003, 2005
+ *
+ * Maintained by: Benjamin Herrenschmidt
+ *                <benh@kernel.crashing.org>
+ *
+ * This file is covered by the GNU Public Licence v2 as
+ * described in the kernel's COPYING file.
+ */
+
+#include <asm/reg.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+
+	.text
+
+/*
+ * Stackframe:
+ *		
+ *         +-> Back chain			(SP + 256)
+ *         |   General register save area	(SP + 112)
+ *         |   Parameter save area		(SP + 48)
+ *         |   TOC save area			(SP + 40)
+ *         |   link editor doubleword		(SP + 32)
+ *         |   compiler doubleword		(SP + 24)
+ *         |   LR save area			(SP + 16)
+ *         |   CR save area			(SP + 8)
+ * SP ---> +-- Back chain			(SP + 0)
+ */
+#define STACKFRAMESIZE	256
+
+/* Save parameters offsets */
+#define STK_PARM(i)	(STACKFRAMESIZE + 48 + ((i)-3)*8)
+
+/* Save non-volatile offsets */
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+
+#ifndef CONFIG_PPC_64K_PAGES
+
+/*****************************************************************************
+ *                                                                           *
+ *           4K SW & 4K HW pages implementation                              *
+ *                                                                           *
+ *****************************************************************************/
+
+
+/*
+ * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		 pte_t *ptep, unsigned long trap, int local, int ssize)
+ *
+ * Adds a 4K page to the hash table in a segment of 4K pages only
+ */
+
+_GLOBAL(__hash_page_4K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+	std	r9,STK_PARM(r9)(r1)
+	
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamic patched instead ?)
+	 */
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+	
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */ 
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	htab_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	htab_bail_ok
+
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY,HASHPTE and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-useable, we use it for the new HPTE flags
+	 */
+
+BEGIN_FTR_SECTION
+	cmpdi	r9,0			/* check segment size */
+	bne	3f
+END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28
+	rldicl	r3,r3,0,36
+	or	r29,r3,r29
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
+	xor	r28,r5,r0
+	b	4f
+
+3:	/* Calc VA and hash in r29 and r28 for 1T segment */
+	sldi	r29,r5,40		/* vsid << 40 */
+	clrldi	r3,r3,24		/* ea & 0xffffffffff */
+	rldic	r28,r5,25,25		/* (vsid << 25) & 0x7fffffffff */
+	clrldi	r5,r5,40		/* vsid & 0xffffff */
+	rldicl	r0,r3,64-12,36		/* (ea >> 12) & 0xfffffff */
+	xor	r28,r28,r5
+	or	r29,r3,r29		/* VA */
+	xor	r28,r28,r0		/* hash */
+
+	/* Convert linux PTE bits into HW equivalents */
+4:	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...) 
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE
+	 */
+	andi.	r0,r31,_PAGE_HASHPTE
+	bne	htab_modify_pte
+
+htab_insert_pte:
+	/* Clear hpte bits in new pte (we also clear BUSY btw) and
+	 * add _PAGE_HASHPTE
+	 */
+	lis	r0,_PAGE_HPTEFLAGS@h
+	ori	r0,r0,_PAGE_HPTEFLAGS@l
+	andc	r30,r30,r0
+	ori	r30,r30,_PAGE_HASHPTE
+
+	/* physical address r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3		/* r3 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+	ld	r9,STK_PARM(r9)(r1)	/* segment size */
+_GLOBAL(htab_call_hpte_insert1)
+	bl	.			/* Patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Now try secondary slot */
+	
+	/* physical address r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
+	
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+	ld	r9,STK_PARM(r9)(r1)	/* segment size */
+_GLOBAL(htab_call_hpte_insert2)
+	bl	.			/* Patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */	
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+	bl	.			/* Patched by htab_finish_init() */
+
+	/* Try all again */
+	b	htab_insert_pte	
+
+htab_bail_ok:
+	li	r3,0
+	b	htab_bail
+
+htab_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE */
+	rldimi	r30,r3,12,63-15
+		
+	/* Write out the PTE with a normal write
+	 * (maybe add eieio may be good still ?)
+	 */
+htab_write_out_pte:
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3, 0
+htab_bail:
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+htab_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	rlwinm	r3,r31,32-12,29,31
+
+	/* Secondary group ? if yes, get a inverted hash value */
+	mr	r5,r28
+	andi.	r0,r31,_PAGE_SECONDARY
+	beq	1f
+	not	r5,r5
+1:
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_4K		/* page size */
+	ld	r7,STK_PARM(r9)(r1)	/* segment size */
+	ld	r8,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+	bl	.			/* Patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion. 
+	 */
+	cmpdi	0,r3,-1
+	beq-	htab_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	b	htab_write_out_pte
+
+htab_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	htab_bail
+
+htab_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	htab_bail
+
+
+#else /* CONFIG_PPC_64K_PAGES */
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           64K SW & 4K or 64K HW in a 4K segment pages implementation      *
+ *                                                                           *
+ *****************************************************************************/
+
+/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
+ *		 pte_t *ptep, unsigned long trap, int local, int ssize,
+ *		 int subpg_prot)
+ */
+
+/*
+ * For now, we do NOT implement Admixed pages
+ */
+_GLOBAL(__hash_page_4K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+	std	r9,STK_PARM(r9)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamic patched instead ?)
+	 * r26 is the hidx mask
+	 * r25 is the index in combo page
+	 */
+	std	r25,STK_REG(r25)(r1)
+	std	r26,STK_REG(r26)(r1)
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	htab_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	htab_bail_ok
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
+	oris	r30,r30,_PAGE_COMBO@h
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-useable, we use it for the new HPTE flags
+	 */
+
+	/* Load the hidx index */
+	rldicl	r25,r3,64-12,60
+
+BEGIN_FTR_SECTION
+	cmpdi	r9,0			/* check segment size */
+	bne	3f
+END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28		/* r29 = (vsid << 28) */
+	rldicl	r3,r3,0,36		/* r3 = (ea & 0x0fffffff) */
+	or	r29,r3,r29		/* r29 = va */
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-12,48		/* (ea >> 12) & 0xffff */
+	xor	r28,r5,r0
+	b	4f
+
+3:	/* Calc VA and hash in r29 and r28 for 1T segment */
+	sldi	r29,r5,40		/* vsid << 40 */
+	clrldi	r3,r3,24		/* ea & 0xffffffffff */
+	rldic	r28,r5,25,25		/* (vsid << 25) & 0x7fffffffff */
+	clrldi	r5,r5,40		/* vsid & 0xffffff */
+	rldicl	r0,r3,64-12,36		/* (ea >> 12) & 0xfffffff */
+	xor	r28,r28,r5
+	or	r29,r3,r29		/* VA */
+	xor	r28,r28,r0		/* hash */
+
+	/* Convert linux PTE bits into HW equivalents */
+4:
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+	andc	r10,r30,r10
+	andi.	r3,r10,0x1fe		/* Get basic set of flags */
+	rlwinm	r0,r10,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+#else
+	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+#endif
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
+	andc	r0,r3,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE. We look for
+	 * the bit at (1 >> (index + 32))
+	 */
+	rldicl.	r0,r31,64-12,48
+	li	r26,0			/* Default hidx */
+	beq	htab_insert_pte
+
+	/*
+	 * Check if the pte was already inserted into the hash table
+	 * as a 64k HW page, and invalidate the 64k HPTE if so.
+	 */
+	andis.	r0,r31,_PAGE_COMBO@h
+	beq	htab_inval_old_hpte
+
+	ld	r6,STK_PARM(r6)(r1)
+	ori	r26,r6,0x8000		/* Load the hidx mask */
+	ld	r26,0(r26)
+	addi	r5,r25,36		/* Check actual HPTE_SUB bit, this */
+	rldcr.	r0,r31,r5,0		/* must match pgtable.h definition */
+	bne	htab_modify_pte
+
+htab_insert_pte:
+	/* real page number in r5, PTE RPN value + index */
+	andis.	r0,r31,_PAGE_4K_PFN@h
+	srdi	r5,r31,PTE_RPN_SHIFT
+	bne-	htab_special_pfn
+	sldi	r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
+	add	r5,r5,r25
+htab_special_pfn:
+	sldi	r5,r5,HW_PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+	ld	r9,STK_PARM(r9)(r1)	/* segment size */
+_GLOBAL(htab_call_hpte_insert1)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	/* real page number in r5, PTE RPN value + index */
+	andis.	r0,r31,_PAGE_4K_PFN@h
+	srdi	r5,r31,PTE_RPN_SHIFT
+	bne-	3f
+	sldi	r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT
+	add	r5,r5,r25
+3:	sldi	r5,r5,HW_PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3		/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_4K		/* page size */
+	ld	r9,STK_PARM(r9)(r1)	/* segment size */
+_GLOBAL(htab_call_hpte_insert2)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	htab_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	htab_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3		/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(htab_call_hpte_remove)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* Try all again */
+	b	htab_insert_pte
+
+	/*
+	 * Call out to C code to invalidate an 64k HW HPTE that is
+	 * useless now that the segment has been switched to 4k pages.
+	 */
+htab_inval_old_hpte:
+	mr	r3,r29			/* virtual addr */
+	mr	r4,r31			/* PTE.pte */
+	li	r5,0			/* PTE.hidx */
+	li	r6,MMU_PAGE_64K		/* psize */
+	ld	r7,STK_PARM(r9)(r1)	/* ssize */
+	ld	r8,STK_PARM(r8)(r1)	/* local */
+	bl	.flush_hash_page
+	/* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */
+	lis	r0,_PAGE_HPTE_SUB@h
+	ori	r0,r0,_PAGE_HPTE_SUB@l
+	andc	r30,r30,r0
+	b	htab_insert_pte
+	
+htab_bail_ok:
+	li	r3,0
+	b	htab_bail
+
+htab_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE second half,
+	 * clear _PAGE_BUSY and set approriate HPTE slot bit
+	 */
+	ld	r6,STK_PARM(r6)(r1)
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	/* HPTE SUB bit */
+	li	r0,1
+	subfic	r5,r25,27		/* Must match bit position in */
+	sld	r0,r0,r5		/* pgtable.h */
+	or	r30,r30,r0
+	/* hindx */
+	sldi	r5,r25,2
+	sld	r3,r3,r5
+	li	r4,0xf
+	sld	r4,r4,r5
+	andc	r26,r26,r4
+	or	r26,r26,r3
+	ori	r5,r6,0x8000
+	std	r26,0(r5)
+	lwsync
+	std	r30,0(r6)
+	li	r3, 0
+htab_bail:
+	ld	r25,STK_REG(r25)(r1)
+	ld	r26,STK_REG(r26)(r1)
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+htab_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	sldi	r5,r25,2
+	srd	r3,r26,r5
+
+	/* Secondary group ? if yes, get a inverted hash value */
+	mr	r5,r28
+	andi.	r0,r3,0x8 /* page secondary ? */
+	beq	1f
+	not	r5,r5
+1:	andi.	r3,r3,0x7 /* extract idx alone */
+
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_4K		/* page size */
+	ld	r7,STK_PARM(r9)(r1)	/* segment size */
+	ld	r8,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(htab_call_hpte_updatepp)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion.
+	 */
+	cmpdi	0,r3,-1
+	beq-	htab_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3,0
+	b	htab_bail
+
+htab_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	htab_bail
+
+htab_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	htab_bail
+
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_HAS_HASH_64K
+
+/*****************************************************************************
+ *                                                                           *
+ *           64K SW & 64K HW in a 64K segment pages implementation           *
+ *                                                                           *
+ *****************************************************************************/
+
+_GLOBAL(__hash_page_64K)
+	mflr	r0
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	/* Save all params that we need after a function call */
+	std	r6,STK_PARM(r6)(r1)
+	std	r8,STK_PARM(r8)(r1)
+	std	r9,STK_PARM(r9)(r1)
+
+	/* Add _PAGE_PRESENT to access */
+	ori	r4,r4,_PAGE_PRESENT
+
+	/* Save non-volatile registers.
+	 * r31 will hold "old PTE"
+	 * r30 is "new PTE"
+	 * r29 is "va"
+	 * r28 is a hash value
+	 * r27 is hashtab mask (maybe dynamic patched instead ?)
+	 */
+	std	r27,STK_REG(r27)(r1)
+	std	r28,STK_REG(r28)(r1)
+	std	r29,STK_REG(r29)(r1)
+	std	r30,STK_REG(r30)(r1)
+	std	r31,STK_REG(r31)(r1)
+
+	/* Step 1:
+	 *
+	 * Check permissions, atomically mark the linux PTE busy
+	 * and hashed.
+	 */
+1:
+	ldarx	r31,0,r6
+	/* Check access rights (access & ~(pte_val(*ptep))) */
+	andc.	r0,r4,r31
+	bne-	ht64_wrong_access
+	/* Check if PTE is busy */
+	andi.	r0,r31,_PAGE_BUSY
+	/* If so, just bail out and refault if needed. Someone else
+	 * is changing this PTE anyway and might hash it.
+	 */
+	bne-	ht64_bail_ok
+BEGIN_FTR_SECTION
+	/* Check if PTE has the cache-inhibit bit set */
+	andi.	r0,r31,_PAGE_NO_CACHE
+	/* If so, bail out and refault as a 4k page */
+	bne-	ht64_bail_ok
+END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
+	/* Prepare new PTE value (turn access RW into DIRTY, then
+	 * add BUSY and ACCESSED)
+	 */
+	rlwinm	r30,r4,32-9+7,31-7,31-7	/* _PAGE_RW -> _PAGE_DIRTY */
+	or	r30,r30,r31
+	ori	r30,r30,_PAGE_BUSY | _PAGE_ACCESSED
+	/* Write the linux PTE atomically (setting busy) */
+	stdcx.	r30,0,r6
+	bne-	1b
+	isync
+
+	/* Step 2:
+	 *
+	 * Insert/Update the HPTE in the hash table. At this point,
+	 * r4 (access) is re-useable, we use it for the new HPTE flags
+	 */
+
+BEGIN_FTR_SECTION
+	cmpdi	r9,0			/* check segment size */
+	bne	3f
+END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+	/* Calc va and put it in r29 */
+	rldicr	r29,r5,28,63-28
+	rldicl	r3,r3,0,36
+	or	r29,r3,r29
+
+	/* Calculate hash value for primary slot and store it in r28 */
+	rldicl	r5,r5,0,25		/* vsid & 0x0000007fffffffff */
+	rldicl	r0,r3,64-16,52		/* (ea >> 16) & 0xfff */
+	xor	r28,r5,r0
+	b	4f
+
+3:	/* Calc VA and hash in r29 and r28 for 1T segment */
+	sldi	r29,r5,40		/* vsid << 40 */
+	clrldi	r3,r3,24		/* ea & 0xffffffffff */
+	rldic	r28,r5,25,25		/* (vsid << 25) & 0x7fffffffff */
+	clrldi	r5,r5,40		/* vsid & 0xffffff */
+	rldicl	r0,r3,64-16,40		/* (ea >> 16) & 0xffffff */
+	xor	r28,r28,r5
+	or	r29,r3,r29		/* VA */
+	xor	r28,r28,r0		/* hash */
+
+	/* Convert linux PTE bits into HW equivalents */
+4:	andi.	r3,r30,0x1fe		/* Get basic set of flags */
+	xori	r3,r3,HPTE_R_N		/* _PAGE_EXEC -> NOEXEC */
+	rlwinm	r0,r30,32-9+1,30,30	/* _PAGE_RW -> _PAGE_USER (r0) */
+	rlwinm	r4,r30,32-7+1,30,30	/* _PAGE_DIRTY -> _PAGE_USER (r4) */
+	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
+	andc	r0,r30,r0		/* r0 = pte & ~r0 */
+	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
+	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+
+	/* We eventually do the icache sync here (maybe inline that
+	 * code rather than call a C function...)
+	 */
+BEGIN_FTR_SECTION
+	mr	r4,r30
+	mr	r5,r7
+	bl	.hash_page_do_lazy_icache
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
+
+	/* At this point, r3 contains new PP bits, save them in
+	 * place of "access" in the param area (sic)
+	 */
+	std	r3,STK_PARM(r4)(r1)
+
+	/* Get htab_hash_mask */
+	ld	r4,htab_hash_mask@got(2)
+	ld	r27,0(r4)	/* htab_hash_mask -> r27 */
+
+	/* Check if we may already be in the hashtable, in this case, we
+	 * go to out-of-line code to try to modify the HPTE
+	 */
+	rldicl.	r0,r31,64-12,48
+	bne	ht64_modify_pte
+
+ht64_insert_pte:
+	/* Clear hpte bits in new pte (we also clear BUSY btw) and
+	 * add _PAGE_HPTE_SUB0
+	 */
+	lis	r0,_PAGE_HPTEFLAGS@h
+	ori	r0,r0,_PAGE_HPTEFLAGS@l
+	andc	r30,r30,r0
+#ifdef CONFIG_PPC_64K_PAGES
+	oris	r30,r30,_PAGE_HPTE_SUB0@h
+#else
+	ori	r30,r30,_PAGE_HASHPTE
+#endif
+	/* Phyical address in r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate primary group hash */
+	and	r0,r28,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r7,0			/* !bolted, !secondary */
+	li	r8,MMU_PAGE_64K
+	ld	r9,STK_PARM(r9)(r1)	/* segment size */
+_GLOBAL(ht64_call_hpte_insert1)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge	ht64_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	ht64_pte_insert_failure
+
+	/* Now try secondary slot */
+
+	/* Phyical address in r5 */
+	rldicl	r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT
+	sldi	r5,r5,PAGE_SHIFT
+
+	/* Calculate secondary group hash */
+	andc	r0,r27,r28
+	rldicr	r3,r0,3,63-3	/* r0 = (~hash & mask) << 3 */
+
+	/* Call ppc_md.hpte_insert */
+	ld	r6,STK_PARM(r4)(r1)	/* Retreive new pp bits */
+	mr	r4,r29			/* Retreive va */
+	li	r7,HPTE_V_SECONDARY	/* !bolted, secondary */
+	li	r8,MMU_PAGE_64K
+	ld	r9,STK_PARM(r9)(r1)	/* segment size */
+_GLOBAL(ht64_call_hpte_insert2)
+	bl	.			/* patched by htab_finish_init() */
+	cmpdi	0,r3,0
+	bge+	ht64_pte_insert_ok	/* Insertion successful */
+	cmpdi	0,r3,-2			/* Critical failure */
+	beq-	ht64_pte_insert_failure
+
+	/* Both are full, we need to evict something */
+	mftb	r0
+	/* Pick a random group based on TB */
+	andi.	r0,r0,1
+	mr	r5,r28
+	bne	2f
+	not	r5,r5
+2:	and	r0,r5,r27
+	rldicr	r3,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	/* Call ppc_md.hpte_remove */
+_GLOBAL(ht64_call_hpte_remove)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* Try all again */
+	b	ht64_insert_pte
+
+ht64_bail_ok:
+	li	r3,0
+	b	ht64_bail
+
+ht64_pte_insert_ok:
+	/* Insert slot number & secondary bit in PTE */
+	rldimi	r30,r3,12,63-15
+
+	/* Write out the PTE with a normal write
+	 * (maybe add eieio may be good still ?)
+	 */
+ht64_write_out_pte:
+	ld	r6,STK_PARM(r6)(r1)
+	std	r30,0(r6)
+	li	r3, 0
+ht64_bail:
+	ld	r27,STK_REG(r27)(r1)
+	ld	r28,STK_REG(r28)(r1)
+	ld	r29,STK_REG(r29)(r1)
+	ld      r30,STK_REG(r30)(r1)
+	ld      r31,STK_REG(r31)(r1)
+	addi    r1,r1,STACKFRAMESIZE
+	ld      r0,16(r1)
+	mtlr    r0
+	blr
+
+ht64_modify_pte:
+	/* Keep PP bits in r4 and slot idx from the PTE around in r3 */
+	mr	r4,r3
+	rlwinm	r3,r31,32-12,29,31
+
+	/* Secondary group ? if yes, get a inverted hash value */
+	mr	r5,r28
+	andi.	r0,r31,_PAGE_F_SECOND
+	beq	1f
+	not	r5,r5
+1:
+	/* Calculate proper slot value for ppc_md.hpte_updatepp */
+	and	r0,r5,r27
+	rldicr	r0,r0,3,63-3	/* r0 = (hash & mask) << 3 */
+	add	r3,r0,r3	/* add slot idx */
+
+	/* Call ppc_md.hpte_updatepp */
+	mr	r5,r29			/* va */
+	li	r6,MMU_PAGE_64K
+	ld	r7,STK_PARM(r9)(r1)	/* segment size */
+	ld	r8,STK_PARM(r8)(r1)	/* get "local" param */
+_GLOBAL(ht64_call_hpte_updatepp)
+	bl	.			/* patched by htab_finish_init() */
+
+	/* if we failed because typically the HPTE wasn't really here
+	 * we try an insertion.
+	 */
+	cmpdi	0,r3,-1
+	beq-	ht64_insert_pte
+
+	/* Clear the BUSY bit and Write out the PTE */
+	li	r0,_PAGE_BUSY
+	andc	r30,r30,r0
+	b	ht64_write_out_pte
+
+ht64_wrong_access:
+	/* Bail out clearing reservation */
+	stdcx.	r31,0,r6
+	li	r3,1
+	b	ht64_bail
+
+ht64_pte_insert_failure:
+	/* Bail out restoring old PTE */
+	ld	r6,STK_PARM(r6)(r1)
+	std	r31,0(r6)
+	li	r3,-1
+	b	ht64_bail
+
+
+#endif /* CONFIG_PPC_HAS_HASH_64K */
+
+
+/*****************************************************************************
+ *                                                                           *
+ *           Huge pages implementation is in hugetlbpage.c                   *
+ *                                                                           *
+ *****************************************************************************/
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
new file mode 100644
index 0000000..34e5c0b
--- /dev/null
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -0,0 +1,569 @@
+/*
+ * native hashtable management.
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG_LOW
+
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/smp.h>
+
+#include <asm/abs_addr.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/kexec.h>
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define HPTE_LOCK_BIT 3
+
+static DEFINE_SPINLOCK(native_tlbie_lock);
+
+static inline void __tlbie(unsigned long va, int psize, int ssize)
+{
+	unsigned int penc;
+
+	/* clear top 16 bits, non SLS segment */
+	va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		va &= ~0xffful;
+		va |= ssize << 8;
+		asm volatile("tlbie %0,0" : : "r" (va) : "memory");
+		break;
+	default:
+		penc = mmu_psize_defs[psize].penc;
+		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		va |= penc << 12;
+		va |= ssize << 8;
+		asm volatile("tlbie %0,1" : : "r" (va) : "memory");
+		break;
+	}
+}
+
+static inline void __tlbiel(unsigned long va, int psize, int ssize)
+{
+	unsigned int penc;
+
+	/* clear top 16 bits, non SLS segment */
+	va &= ~(0xffffULL << 48);
+
+	switch (psize) {
+	case MMU_PAGE_4K:
+		va &= ~0xffful;
+		va |= ssize << 8;
+		asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
+			     : : "r"(va) : "memory");
+		break;
+	default:
+		penc = mmu_psize_defs[psize].penc;
+		va &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+		va |= penc << 12;
+		va |= ssize << 8;
+		asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)"
+			     : : "r"(va) : "memory");
+		break;
+	}
+
+}
+
+static inline void tlbie(unsigned long va, int psize, int ssize, int local)
+{
+	unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL);
+	int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+	if (use_local)
+		use_local = mmu_psize_defs[psize].tlbiel;
+	if (lock_tlbie && !use_local)
+		spin_lock(&native_tlbie_lock);
+	asm volatile("ptesync": : :"memory");
+	if (use_local) {
+		__tlbiel(va, psize, ssize);
+		asm volatile("ptesync": : :"memory");
+	} else {
+		__tlbie(va, psize, ssize);
+		asm volatile("eieio; tlbsync; ptesync": : :"memory");
+	}
+	if (lock_tlbie && !use_local)
+		spin_unlock(&native_tlbie_lock);
+}
+
+static inline void native_lock_hpte(struct hash_pte *hptep)
+{
+	unsigned long *word = &hptep->v;
+
+	while (1) {
+		if (!test_and_set_bit(HPTE_LOCK_BIT, word))
+			break;
+		while(test_bit(HPTE_LOCK_BIT, word))
+			cpu_relax();
+	}
+}
+
+static inline void native_unlock_hpte(struct hash_pte *hptep)
+{
+	unsigned long *word = &hptep->v;
+
+	asm volatile("lwsync":::"memory");
+	clear_bit(HPTE_LOCK_BIT, word);
+}
+
+static long native_hpte_insert(unsigned long hpte_group, unsigned long va,
+			unsigned long pa, unsigned long rflags,
+			unsigned long vflags, int psize, int ssize)
+{
+	struct hash_pte *hptep = htab_address + hpte_group;
+	unsigned long hpte_v, hpte_r;
+	int i;
+
+	if (!(vflags & HPTE_V_BOLTED)) {
+		DBG_LOW("    insert(group=%lx, va=%016lx, pa=%016lx,"
+			" rflags=%lx, vflags=%lx, psize=%d)\n",
+			hpte_group, va, pa, rflags, vflags, psize);
+	}
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		if (! (hptep->v & HPTE_V_VALID)) {
+			/* retry with lock held */
+			native_lock_hpte(hptep);
+			if (! (hptep->v & HPTE_V_VALID))
+				break;
+			native_unlock_hpte(hptep);
+		}
+
+		hptep++;
+	}
+
+	if (i == HPTES_PER_GROUP)
+		return -1;
+
+	hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
+	hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+	if (!(vflags & HPTE_V_BOLTED)) {
+		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
+			i, hpte_v, hpte_r);
+	}
+
+	hptep->r = hpte_r;
+	/* Guarantee the second dword is visible before the valid bit */
+	eieio();
+	/*
+	 * Now set the first dword including the valid bit
+	 * NOTE: this also unlocks the hpte
+	 */
+	hptep->v = hpte_v;
+
+	__asm__ __volatile__ ("ptesync" : : : "memory");
+
+	return i | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static long native_hpte_remove(unsigned long hpte_group)
+{
+	struct hash_pte *hptep;
+	int i;
+	int slot_offset;
+	unsigned long hpte_v;
+
+	DBG_LOW("    remove(group=%lx)\n", hpte_group);
+
+	/* pick a random entry to start at */
+	slot_offset = mftb() & 0x7;
+
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		hptep = htab_address + hpte_group + slot_offset;
+		hpte_v = hptep->v;
+
+		if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) {
+			/* retry with lock held */
+			native_lock_hpte(hptep);
+			hpte_v = hptep->v;
+			if ((hpte_v & HPTE_V_VALID)
+			    && !(hpte_v & HPTE_V_BOLTED))
+				break;
+			native_unlock_hpte(hptep);
+		}
+
+		slot_offset++;
+		slot_offset &= 0x7;
+	}
+
+	if (i == HPTES_PER_GROUP)
+		return -1;
+
+	/* Invalidate the hpte. NOTE: this also unlocks it */
+	hptep->v = 0;
+
+	return i;
+}
+
+static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
+				 unsigned long va, int psize, int ssize,
+				 int local)
+{
+	struct hash_pte *hptep = htab_address + slot;
+	unsigned long hpte_v, want_v;
+	int ret = 0;
+
+	want_v = hpte_encode_v(va, psize, ssize);
+
+	DBG_LOW("    update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)",
+		va, want_v & HPTE_V_AVPN, slot, newpp);
+
+	native_lock_hpte(hptep);
+
+	hpte_v = hptep->v;
+
+	/* Even if we miss, we need to invalidate the TLB */
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
+		DBG_LOW(" -> miss\n");
+		ret = -1;
+	} else {
+		DBG_LOW(" -> hit\n");
+		/* Update the HPTE */
+		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
+			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
+	}
+	native_unlock_hpte(hptep);
+
+	/* Ensure it is out of the tlb too. */
+	tlbie(va, psize, ssize, local);
+
+	return ret;
+}
+
+static long native_hpte_find(unsigned long va, int psize, int ssize)
+{
+	struct hash_pte *hptep;
+	unsigned long hash;
+	unsigned long i;
+	long slot;
+	unsigned long want_v, hpte_v;
+
+	hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
+	want_v = hpte_encode_v(va, psize, ssize);
+
+	/* Bolted mappings are only ever in the primary group */
+	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	for (i = 0; i < HPTES_PER_GROUP; i++) {
+		hptep = htab_address + slot;
+		hpte_v = hptep->v;
+
+		if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+			/* HPTE matches */
+			return slot;
+		++slot;
+	}
+
+	return -1;
+}
+
+/*
+ * Update the page protection bits. Intended to be used to create
+ * guard pages for kernel data structures on pages which are bolted
+ * in the HPT. Assumes pages being operated on will not be stolen.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
+				       int psize, int ssize)
+{
+	unsigned long vsid, va;
+	long slot;
+	struct hash_pte *hptep;
+
+	vsid = get_kernel_vsid(ea, ssize);
+	va = hpt_va(ea, vsid, ssize);
+
+	slot = native_hpte_find(va, psize, ssize);
+	if (slot == -1)
+		panic("could not find page to bolt\n");
+	hptep = htab_address + slot;
+
+	/* Update the HPTE */
+	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
+		(newpp & (HPTE_R_PP | HPTE_R_N));
+
+	/* Ensure it is out of the tlb too. */
+	tlbie(va, psize, ssize, 0);
+}
+
+static void native_hpte_invalidate(unsigned long slot, unsigned long va,
+				   int psize, int ssize, int local)
+{
+	struct hash_pte *hptep = htab_address + slot;
+	unsigned long hpte_v;
+	unsigned long want_v;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	DBG_LOW("    invalidate(va=%016lx, hash: %x)\n", va, slot);
+
+	want_v = hpte_encode_v(va, psize, ssize);
+	native_lock_hpte(hptep);
+	hpte_v = hptep->v;
+
+	/* Even if we miss, we need to invalidate the TLB */
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+		native_unlock_hpte(hptep);
+	else
+		/* Invalidate the hpte. NOTE: this also unlocks it */
+		hptep->v = 0;
+
+	/* Invalidate the TLB */
+	tlbie(va, psize, ssize, local);
+
+	local_irq_restore(flags);
+}
+
+#define LP_SHIFT	12
+#define LP_BITS		8
+#define LP_MASK(i)	((0xFF >> (i)) << LP_SHIFT)
+
+static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
+			int *psize, int *ssize, unsigned long *va)
+{
+	unsigned long hpte_r = hpte->r;
+	unsigned long hpte_v = hpte->v;
+	unsigned long avpn;
+	int i, size, shift, penc;
+
+	if (!(hpte_v & HPTE_V_LARGE))
+		size = MMU_PAGE_4K;
+	else {
+		for (i = 0; i < LP_BITS; i++) {
+			if ((hpte_r & LP_MASK(i+1)) == LP_MASK(i+1))
+				break;
+		}
+		penc = LP_MASK(i+1) >> LP_SHIFT;
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
+
+			/* 4K pages are not represented by LP */
+			if (size == MMU_PAGE_4K)
+				continue;
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
+
+			if (penc == mmu_psize_defs[size].penc)
+				break;
+		}
+	}
+
+	/* This works for all page sizes, and for 256M and 1T segments */
+	shift = mmu_psize_defs[size].shift;
+	avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm) << 23;
+
+	if (shift < 23) {
+		unsigned long vpi, vsid, pteg;
+
+		pteg = slot / HPTES_PER_GROUP;
+		if (hpte_v & HPTE_V_SECONDARY)
+			pteg = ~pteg;
+		switch (hpte_v >> HPTE_V_SSIZE_SHIFT) {
+		case MMU_SEGSIZE_256M:
+			vpi = ((avpn >> 28) ^ pteg) & htab_hash_mask;
+			break;
+		case MMU_SEGSIZE_1T:
+			vsid = avpn >> 40;
+			vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask;
+			break;
+		default:
+			avpn = vpi = size = 0;
+		}
+		avpn |= (vpi << mmu_psize_defs[size].shift);
+	}
+
+	*va = avpn;
+	*psize = size;
+	*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
+}
+
+/*
+ * clear all mappings on kexec.  All cpus are in real mode (or they will
+ * be when they isi), and we are the only one left.  We rely on our kernel
+ * mapping being 0xC0's and the hardware ignoring those two real bits.
+ *
+ * TODO: add batching support when enabled.  remember, no dynamic memory here,
+ * athough there is the control page available...
+ */
+static void native_hpte_clear(void)
+{
+	unsigned long slot, slots, flags;
+	struct hash_pte *hptep = htab_address;
+	unsigned long hpte_v, va;
+	unsigned long pteg_count;
+	int psize, ssize;
+
+	pteg_count = htab_hash_mask + 1;
+
+	local_irq_save(flags);
+
+	/* we take the tlbie lock and hold it.  Some hardware will
+	 * deadlock if we try to tlbie from two processors at once.
+	 */
+	spin_lock(&native_tlbie_lock);
+
+	slots = pteg_count * HPTES_PER_GROUP;
+
+	for (slot = 0; slot < slots; slot++, hptep++) {
+		/*
+		 * we could lock the pte here, but we are the only cpu
+		 * running,  right?  and for crash dump, we probably
+		 * don't want to wait for a maybe bad cpu.
+		 */
+		hpte_v = hptep->v;
+
+		/*
+		 * Call __tlbie() here rather than tlbie() since we
+		 * already hold the native_tlbie_lock.
+		 */
+		if (hpte_v & HPTE_V_VALID) {
+			hpte_decode(hptep, slot, &psize, &ssize, &va);
+			hptep->v = 0;
+			__tlbie(va, psize, ssize);
+		}
+	}
+
+	asm volatile("eieio; tlbsync; ptesync":::"memory");
+	spin_unlock(&native_tlbie_lock);
+	local_irq_restore(flags);
+}
+
+/*
+ * Batched hash table flush, we batch the tlbie's to avoid taking/releasing
+ * the lock all the time
+ */
+static void native_flush_hash_range(unsigned long number, int local)
+{
+	unsigned long va, hash, index, hidx, shift, slot;
+	struct hash_pte *hptep;
+	unsigned long hpte_v;
+	unsigned long want_v;
+	unsigned long flags;
+	real_pte_t pte;
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	unsigned long psize = batch->psize;
+	int ssize = batch->ssize;
+	int i;
+
+	local_irq_save(flags);
+
+	for (i = 0; i < number; i++) {
+		va = batch->vaddr[i];
+		pte = batch->pte[i];
+
+		pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+			hash = hpt_hash(va, shift, ssize);
+			hidx = __rpte_to_hidx(pte, index);
+			if (hidx & _PTEIDX_SECONDARY)
+				hash = ~hash;
+			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+			slot += hidx & _PTEIDX_GROUP_IX;
+			hptep = htab_address + slot;
+			want_v = hpte_encode_v(va, psize, ssize);
+			native_lock_hpte(hptep);
+			hpte_v = hptep->v;
+			if (!HPTE_V_COMPARE(hpte_v, want_v) ||
+			    !(hpte_v & HPTE_V_VALID))
+				native_unlock_hpte(hptep);
+			else
+				hptep->v = 0;
+		} pte_iterate_hashed_end();
+	}
+
+	if (cpu_has_feature(CPU_FTR_TLBIEL) &&
+	    mmu_psize_defs[psize].tlbiel && local) {
+		asm volatile("ptesync":::"memory");
+		for (i = 0; i < number; i++) {
+			va = batch->vaddr[i];
+			pte = batch->pte[i];
+
+			pte_iterate_hashed_subpages(pte, psize, va, index,
+						    shift) {
+				__tlbiel(va, psize, ssize);
+			} pte_iterate_hashed_end();
+		}
+		asm volatile("ptesync":::"memory");
+	} else {
+		int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+
+		if (lock_tlbie)
+			spin_lock(&native_tlbie_lock);
+
+		asm volatile("ptesync":::"memory");
+		for (i = 0; i < number; i++) {
+			va = batch->vaddr[i];
+			pte = batch->pte[i];
+
+			pte_iterate_hashed_subpages(pte, psize, va, index,
+						    shift) {
+				__tlbie(va, psize, ssize);
+			} pte_iterate_hashed_end();
+		}
+		asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+		if (lock_tlbie)
+			spin_unlock(&native_tlbie_lock);
+	}
+
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PPC_PSERIES
+/* Disable TLB batching on nighthawk */
+static inline int tlb_batching_enabled(void)
+{
+	struct device_node *root = of_find_node_by_path("/");
+	int enabled = 1;
+
+	if (root) {
+		const char *model = of_get_property(root, "model", NULL);
+		if (model && !strcmp(model, "IBM,9076-N81"))
+			enabled = 0;
+		of_node_put(root);
+	}
+
+	return enabled;
+}
+#else
+static inline int tlb_batching_enabled(void)
+{
+	return 1;
+}
+#endif
+
+void __init hpte_init_native(void)
+{
+	ppc_md.hpte_invalidate	= native_hpte_invalidate;
+	ppc_md.hpte_updatepp	= native_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp;
+	ppc_md.hpte_insert	= native_hpte_insert;
+	ppc_md.hpte_remove	= native_hpte_remove;
+	ppc_md.hpte_clear_all	= native_hpte_clear;
+	if (tlb_batching_enabled())
+		ppc_md.flush_hash_range = native_flush_hash_range;
+}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
new file mode 100644
index 0000000..8d5b475
--- /dev/null
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -0,0 +1,1197 @@
+/*
+ * PowerPC64 port by Mike Corrigan and Dave Engebretsen
+ *   {mikejc|engebret}@us.ibm.com
+ *
+ *    Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com>
+ *
+ * SMP scalability work:
+ *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * 
+ *    Module name: htab.c
+ *
+ *    Description:
+ *      PowerPC Hashed Page Table functions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+#undef DEBUG_LOW
+
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/lmb.h>
+
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <asm/types.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+#include <asm/abs_addr.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/eeh.h>
+#include <asm/tlb.h>
+#include <asm/cacheflush.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/spu.h>
+#include <asm/udbg.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) udbg_printf(fmt)
+#else
+#define DBG_LOW(fmt...)
+#endif
+
+#define KB (1024)
+#define MB (1024*KB)
+#define GB (1024L*MB)
+
+/*
+ * Note:  pte   --> Linux PTE
+ *        HPTE  --> PowerPC Hashed Page Table Entry
+ *
+ * Execution context:
+ *   htab_initialize is called with the MMU off (of course), but
+ *   the kernel has been copied down to zero so it can directly
+ *   reference global data.  At this point it is very difficult
+ *   to print debug info.
+ *
+ */
+
+#ifdef CONFIG_U3_DART
+extern unsigned long dart_tablebase;
+#endif /* CONFIG_U3_DART */
+
+static unsigned long _SDR1;
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
+
+struct hash_pte *htab_address;
+unsigned long htab_size_bytes;
+unsigned long htab_hash_mask;
+int mmu_linear_psize = MMU_PAGE_4K;
+int mmu_virtual_psize = MMU_PAGE_4K;
+int mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+int mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif
+int mmu_io_psize = MMU_PAGE_4K;
+int mmu_kernel_ssize = MMU_SEGSIZE_256M;
+int mmu_highuser_ssize = MMU_SEGSIZE_256M;
+u16 mmu_slb_size = 64;
+#ifdef CONFIG_HUGETLB_PAGE
+unsigned int HPAGE_SHIFT;
+#endif
+#ifdef CONFIG_PPC_64K_PAGES
+int mmu_ci_restrictions;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_SPINLOCK(linear_map_hash_lock);
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+/* There are definitions of page sizes arrays to be used when none
+ * is provided by the firmware.
+ */
+
+/* Pre-POWER4 CPUs (4k pages only)
+ */
+static struct mmu_psize_def mmu_psize_defaults_old[] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.sllp	= 0,
+		.penc	= 0,
+		.avpnm	= 0,
+		.tlbiel = 0,
+	},
+};
+
+/* POWER4, GPUL, POWER5
+ *
+ * Support for 16Mb large pages
+ */
+static struct mmu_psize_def mmu_psize_defaults_gp[] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.sllp	= 0,
+		.penc	= 0,
+		.avpnm	= 0,
+		.tlbiel = 1,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.sllp	= SLB_VSID_L,
+		.penc	= 0,
+		.avpnm	= 0x1UL,
+		.tlbiel = 0,
+	},
+};
+
+static unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+	unsigned long rflags = pteflags & 0x1fa;
+
+	/* _PAGE_EXEC -> NOEXEC */
+	if ((pteflags & _PAGE_EXEC) == 0)
+		rflags |= HPTE_R_N;
+
+	/* PP bits. PAGE_USER is already PP bit 0x2, so we only
+	 * need to add in 0x1 if it's a read-only user page
+	 */
+	if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
+					 (pteflags & _PAGE_DIRTY)))
+		rflags |= 1;
+
+	/* Always add C */
+	return rflags | HPTE_R_C;
+}
+
+int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
+		      unsigned long pstart, unsigned long prot,
+		      int psize, int ssize)
+{
+	unsigned long vaddr, paddr;
+	unsigned int step, shift;
+	int ret = 0;
+
+	shift = mmu_psize_defs[psize].shift;
+	step = 1 << shift;
+
+	prot = htab_convert_pte_flags(prot);
+
+	DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+	    vstart, vend, pstart, prot, psize, ssize);
+
+	for (vaddr = vstart, paddr = pstart; vaddr < vend;
+	     vaddr += step, paddr += step) {
+		unsigned long hash, hpteg;
+		unsigned long vsid = get_kernel_vsid(vaddr, ssize);
+		unsigned long va = hpt_va(vaddr, vsid, ssize);
+		unsigned long tprot = prot;
+
+		/* Make kernel text executable */
+		if (overlaps_kernel_text(vaddr, vaddr + step))
+			tprot &= ~HPTE_R_N;
+
+		hash = hpt_hash(va, shift, ssize);
+		hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+		BUG_ON(!ppc_md.hpte_insert);
+		ret = ppc_md.hpte_insert(hpteg, va, paddr, tprot,
+					 HPTE_V_BOLTED, psize, ssize);
+
+		if (ret < 0)
+			break;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
+			linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+	}
+	return ret < 0 ? ret : 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int htab_remove_mapping(unsigned long vstart, unsigned long vend,
+		      int psize, int ssize)
+{
+	unsigned long vaddr;
+	unsigned int step, shift;
+
+	shift = mmu_psize_defs[psize].shift;
+	step = 1 << shift;
+
+	if (!ppc_md.hpte_removebolted) {
+		printk(KERN_WARNING "Platform doesn't implement "
+				"hpte_removebolted\n");
+		return -EINVAL;
+	}
+
+	for (vaddr = vstart; vaddr < vend; vaddr += step)
+		ppc_md.hpte_removebolted(vaddr, psize, ssize);
+
+	return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static int __init htab_dt_scan_seg_sizes(unsigned long node,
+					 const char *uname, int depth,
+					 void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+	unsigned long size = 0;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,processor-segment-sizes",
+					  &size);
+	if (prop == NULL)
+		return 0;
+	for (; size >= 4; size -= 4, ++prop) {
+		if (prop[0] == 40) {
+			DBG("1T segment support detected\n");
+			cur_cpu_spec->cpu_features |= CPU_FTR_1T_SEGMENT;
+			return 1;
+		}
+	}
+	cur_cpu_spec->cpu_features &= ~CPU_FTR_NO_SLBIE_B;
+	return 0;
+}
+
+static void __init htab_init_seg_sizes(void)
+{
+	of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
+}
+
+static int __init htab_dt_scan_page_sizes(unsigned long node,
+					  const char *uname, int depth,
+					  void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+	unsigned long size = 0;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node,
+					  "ibm,segment-page-sizes", &size);
+	if (prop != NULL) {
+		DBG("Page sizes from device-tree:\n");
+		size /= 4;
+		cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE);
+		while(size > 0) {
+			unsigned int shift = prop[0];
+			unsigned int slbenc = prop[1];
+			unsigned int lpnum = prop[2];
+			unsigned int lpenc = 0;
+			struct mmu_psize_def *def;
+			int idx = -1;
+
+			size -= 3; prop += 3;
+			while(size > 0 && lpnum) {
+				if (prop[0] == shift)
+					lpenc = prop[1];
+				prop += 2; size -= 2;
+				lpnum--;
+			}
+			switch(shift) {
+			case 0xc:
+				idx = MMU_PAGE_4K;
+				break;
+			case 0x10:
+				idx = MMU_PAGE_64K;
+				break;
+			case 0x14:
+				idx = MMU_PAGE_1M;
+				break;
+			case 0x18:
+				idx = MMU_PAGE_16M;
+				cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE;
+				break;
+			case 0x22:
+				idx = MMU_PAGE_16G;
+				break;
+			}
+			if (idx < 0)
+				continue;
+			def = &mmu_psize_defs[idx];
+			def->shift = shift;
+			if (shift <= 23)
+				def->avpnm = 0;
+			else
+				def->avpnm = (1 << (shift - 23)) - 1;
+			def->sllp = slbenc;
+			def->penc = lpenc;
+			/* We don't know for sure what's up with tlbiel, so
+			 * for now we only set it for 4K and 64K pages
+			 */
+			if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K)
+				def->tlbiel = 1;
+			else
+				def->tlbiel = 0;
+
+			DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, "
+			    "tlbiel=%d, penc=%d\n",
+			    idx, shift, def->sllp, def->avpnm, def->tlbiel,
+			    def->penc);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+					const char *uname, int depth,
+					void *data) {
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	unsigned long *addr_prop;
+	u32 *page_count_prop;
+	unsigned int expected_pages;
+	long unsigned int phys_addr;
+	long unsigned int block_size;
+
+	/* We are scanning "memory" nodes only */
+	if (type == NULL || strcmp(type, "memory") != 0)
+		return 0;
+
+	/* This property is the log base 2 of the number of virtual pages that
+	 * will represent this memory block. */
+	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+	if (page_count_prop == NULL)
+		return 0;
+	expected_pages = (1 << page_count_prop[0]);
+	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+	if (addr_prop == NULL)
+		return 0;
+	phys_addr = addr_prop[0];
+	block_size = addr_prop[1];
+	if (block_size != (16 * GB))
+		return 0;
+	printk(KERN_INFO "Huge page(16GB) memory: "
+			"addr = 0x%lX size = 0x%lX pages = %d\n",
+			phys_addr, block_size, expected_pages);
+	if (phys_addr + (16 * GB) <= lmb_end_of_DRAM()) {
+		lmb_reserve(phys_addr, block_size * expected_pages);
+		add_gpage(phys_addr, block_size, expected_pages);
+	}
+	return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static void __init htab_init_page_sizes(void)
+{
+	int rc;
+
+	/* Default to 4K pages only */
+	memcpy(mmu_psize_defs, mmu_psize_defaults_old,
+	       sizeof(mmu_psize_defaults_old));
+
+	/*
+	 * Try to find the available page sizes in the device-tree
+	 */
+	rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
+	if (rc != 0)  /* Found */
+		goto found;
+
+	/*
+	 * Not in the device-tree, let's fallback on known size
+	 * list for 16M capable GP & GR
+	 */
+	if (cpu_has_feature(CPU_FTR_16M_PAGE))
+		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
+		       sizeof(mmu_psize_defaults_gp));
+ found:
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * Pick a size for the linear mapping. Currently, we only support
+	 * 16M, 1M and 4K which is the default
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		mmu_linear_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		mmu_linear_psize = MMU_PAGE_1M;
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/*
+	 * Pick a size for the ordinary pages. Default is 4K, we support
+	 * 64K for user mappings and vmalloc if supported by the processor.
+	 * We only use 64k for ioremap if the processor
+	 * (and firmware) support cache-inhibited large pages.
+	 * If not, we use 4k and set mmu_ci_restrictions so that
+	 * hash_page knows to switch processes that use cache-inhibited
+	 * mappings to 4k pages.
+	 */
+	if (mmu_psize_defs[MMU_PAGE_64K].shift) {
+		mmu_virtual_psize = MMU_PAGE_64K;
+		mmu_vmalloc_psize = MMU_PAGE_64K;
+		if (mmu_linear_psize == MMU_PAGE_4K)
+			mmu_linear_psize = MMU_PAGE_64K;
+		if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) {
+			/*
+			 * Don't use 64k pages for ioremap on pSeries, since
+			 * that would stop us accessing the HEA ethernet.
+			 */
+			if (!machine_is(pseries))
+				mmu_io_psize = MMU_PAGE_64K;
+		} else
+			mmu_ci_restrictions = 1;
+	}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	/* We try to use 16M pages for vmemmap if that is supported
+	 * and we have at least 1G of RAM at boot
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift &&
+	    lmb_phys_mem_size() >= 0x40000000)
+		mmu_vmemmap_psize = MMU_PAGE_16M;
+	else if (mmu_psize_defs[MMU_PAGE_64K].shift)
+		mmu_vmemmap_psize = MMU_PAGE_64K;
+	else
+		mmu_vmemmap_psize = MMU_PAGE_4K;
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+	printk(KERN_DEBUG "Page orders: linear mapping = %d, "
+	       "virtual = %d, io = %d"
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	       ", vmemmap = %d"
+#endif
+	       "\n",
+	       mmu_psize_defs[mmu_linear_psize].shift,
+	       mmu_psize_defs[mmu_virtual_psize].shift,
+	       mmu_psize_defs[mmu_io_psize].shift
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	       ,mmu_psize_defs[mmu_vmemmap_psize].shift
+#endif
+	       );
+
+#ifdef CONFIG_HUGETLB_PAGE
+	/* Reserve 16G huge page memory sections for huge pages */
+	of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+
+/* Set default large page size. Currently, we pick 16M or 1M depending
+	 * on what is available
+	 */
+	if (mmu_psize_defs[MMU_PAGE_16M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
+	/* With 4k/4level pagetables, we can't (for now) cope with a
+	 * huge page size < PMD_SIZE */
+	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+#endif /* CONFIG_HUGETLB_PAGE */
+}
+
+static int __init htab_dt_scan_pftsize(unsigned long node,
+				       const char *uname, int depth,
+				       void *data)
+{
+	char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	u32 *prop;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL);
+	if (prop != NULL) {
+		/* pft_size[0] is the NUMA CEC cookie */
+		ppc64_pft_size = prop[1];
+		return 1;
+	}
+	return 0;
+}
+
+static unsigned long __init htab_get_table_size(void)
+{
+	unsigned long mem_size, rnd_mem_size, pteg_count;
+
+	/* If hash size isn't already provided by the platform, we try to
+	 * retrieve it from the device-tree. If it's not there neither, we
+	 * calculate it now based on the total RAM size
+	 */
+	if (ppc64_pft_size == 0)
+		of_scan_flat_dt(htab_dt_scan_pftsize, NULL);
+	if (ppc64_pft_size)
+		return 1UL << ppc64_pft_size;
+
+	/* round mem_size up to next power of 2 */
+	mem_size = lmb_phys_mem_size();
+	rnd_mem_size = 1UL << __ilog2(mem_size);
+	if (rnd_mem_size < mem_size)
+		rnd_mem_size <<= 1;
+
+	/* # pages / 2 */
+	pteg_count = max(rnd_mem_size >> (12 + 1), 1UL << 11);
+
+	return pteg_count << 7;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void create_section_mapping(unsigned long start, unsigned long end)
+{
+	BUG_ON(htab_bolt_mapping(start, end, __pa(start),
+				 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+				 mmu_kernel_ssize));
+}
+
+int remove_section_mapping(unsigned long start, unsigned long end)
+{
+	return htab_remove_mapping(start, end, mmu_linear_psize,
+			mmu_kernel_ssize);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static inline void make_bl(unsigned int *insn_addr, void *func)
+{
+	unsigned long funcp = *((unsigned long *)func);
+	int offset = funcp - (unsigned long)insn_addr;
+
+	*insn_addr = (unsigned int)(0x48000001 | (offset & 0x03fffffc));
+	flush_icache_range((unsigned long)insn_addr, 4+
+			   (unsigned long)insn_addr);
+}
+
+static void __init htab_finish_init(void)
+{
+	extern unsigned int *htab_call_hpte_insert1;
+	extern unsigned int *htab_call_hpte_insert2;
+	extern unsigned int *htab_call_hpte_remove;
+	extern unsigned int *htab_call_hpte_updatepp;
+
+#ifdef CONFIG_PPC_HAS_HASH_64K
+	extern unsigned int *ht64_call_hpte_insert1;
+	extern unsigned int *ht64_call_hpte_insert2;
+	extern unsigned int *ht64_call_hpte_remove;
+	extern unsigned int *ht64_call_hpte_updatepp;
+
+	make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert);
+	make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert);
+	make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove);
+	make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp);
+#endif /* CONFIG_PPC_HAS_HASH_64K */
+
+	make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert);
+	make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert);
+	make_bl(htab_call_hpte_remove, ppc_md.hpte_remove);
+	make_bl(htab_call_hpte_updatepp, ppc_md.hpte_updatepp);
+}
+
+void __init htab_initialize(void)
+{
+	unsigned long table;
+	unsigned long pteg_count;
+	unsigned long prot;
+	unsigned long base = 0, size = 0, limit;
+	int i;
+
+	DBG(" -> htab_initialize()\n");
+
+	/* Initialize segment sizes */
+	htab_init_seg_sizes();
+
+	/* Initialize page sizes */
+	htab_init_page_sizes();
+
+	if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) {
+		mmu_kernel_ssize = MMU_SEGSIZE_1T;
+		mmu_highuser_ssize = MMU_SEGSIZE_1T;
+		printk(KERN_INFO "Using 1TB segments\n");
+	}
+
+	/*
+	 * Calculate the required size of the htab.  We want the number of
+	 * PTEGs to equal one half the number of real pages.
+	 */ 
+	htab_size_bytes = htab_get_table_size();
+	pteg_count = htab_size_bytes >> 7;
+
+	htab_hash_mask = pteg_count - 1;
+
+	if (firmware_has_feature(FW_FEATURE_LPAR)) {
+		/* Using a hypervisor which owns the htab */
+		htab_address = NULL;
+		_SDR1 = 0; 
+	} else {
+		/* Find storage for the HPT.  Must be contiguous in
+		 * the absolute address space. On cell we want it to be
+		 * in the first 2 Gig so we can use it for IOMMU hacks.
+		 */
+		if (machine_is(cell))
+			limit = 0x80000000;
+		else
+			limit = 0;
+
+		table = lmb_alloc_base(htab_size_bytes, htab_size_bytes, limit);
+
+		DBG("Hash table allocated at %lx, size: %lx\n", table,
+		    htab_size_bytes);
+
+		htab_address = abs_to_virt(table);
+
+		/* htab absolute addr + encoded htabsize */
+		_SDR1 = table + __ilog2(pteg_count) - 11;
+
+		/* Initialize the HPT with no entries */
+		memset((void *)table, 0, htab_size_bytes);
+
+		/* Set SDR1 */
+		mtspr(SPRN_SDR1, _SDR1);
+	}
+
+	prot = pgprot_val(PAGE_KERNEL);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	linear_map_hash_slots = __va(lmb_alloc_base(linear_map_hash_count,
+						    1, lmb.rmo_size));
+	memset(linear_map_hash_slots, 0, linear_map_hash_count);
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+	/* On U3 based machines, we need to reserve the DART area and
+	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
+	 * cacheable later on
+	 */
+
+	/* create bolted the linear mapping in the hash table */
+	for (i=0; i < lmb.memory.cnt; i++) {
+		base = (unsigned long)__va(lmb.memory.region[i].base);
+		size = lmb.memory.region[i].size;
+
+		DBG("creating mapping for region: %lx..%lx (prot: %x)\n",
+		    base, size, prot);
+
+#ifdef CONFIG_U3_DART
+		/* Do not map the DART space. Fortunately, it will be aligned
+		 * in such a way that it will not cross two lmb regions and
+		 * will fit within a single 16Mb page.
+		 * The DART space is assumed to be a full 16Mb region even if
+		 * we only use 2Mb of that space. We will use more of it later
+		 * for AGP GART. We have to use a full 16Mb large page.
+		 */
+		DBG("DART base: %lx\n", dart_tablebase);
+
+		if (dart_tablebase != 0 && dart_tablebase >= base
+		    && dart_tablebase < (base + size)) {
+			unsigned long dart_table_end = dart_tablebase + 16 * MB;
+			if (base != dart_tablebase)
+				BUG_ON(htab_bolt_mapping(base, dart_tablebase,
+							__pa(base), prot,
+							mmu_linear_psize,
+							mmu_kernel_ssize));
+			if ((base + size) > dart_table_end)
+				BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
+							base + size,
+							__pa(dart_table_end),
+							 prot,
+							 mmu_linear_psize,
+							 mmu_kernel_ssize));
+			continue;
+		}
+#endif /* CONFIG_U3_DART */
+		BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
+				prot, mmu_linear_psize, mmu_kernel_ssize));
+       }
+
+	/*
+	 * If we have a memory_limit and we've allocated TCEs then we need to
+	 * explicitly map the TCE area at the top of RAM. We also cope with the
+	 * case that the TCEs start below memory_limit.
+	 * tce_alloc_start/end are 16MB aligned so the mapping should work
+	 * for either 4K or 16MB pages.
+	 */
+	if (tce_alloc_start) {
+		tce_alloc_start = (unsigned long)__va(tce_alloc_start);
+		tce_alloc_end = (unsigned long)__va(tce_alloc_end);
+
+		if (base + size >= tce_alloc_start)
+			tce_alloc_start = base + size + 1;
+
+		BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
+					 __pa(tce_alloc_start), prot,
+					 mmu_linear_psize, mmu_kernel_ssize));
+	}
+
+	htab_finish_init();
+
+	DBG(" <- htab_initialize()\n");
+}
+#undef KB
+#undef MB
+
+void htab_initialize_secondary(void)
+{
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		mtspr(SPRN_SDR1, _SDR1);
+}
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
+{
+	struct page *page;
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return pp;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			__flush_dcache_icache(page_address(page));
+			set_bit(PG_arch_1, &page->flags);
+		} else
+			pp |= HPTE_R_N;
+	}
+	return pp;
+}
+
+#ifdef CONFIG_PPC_MM_SLICES
+unsigned int get_paca_psize(unsigned long addr)
+{
+	unsigned long index, slices;
+
+	if (addr < SLICE_LOW_TOP) {
+		slices = get_paca()->context.low_slices_psize;
+		index = GET_LOW_SLICE_INDEX(addr);
+	} else {
+		slices = get_paca()->context.high_slices_psize;
+		index = GET_HIGH_SLICE_INDEX(addr);
+	}
+	return (slices >> (index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+	return get_paca()->context.user_psize;
+}
+#endif
+
+/*
+ * Demote a segment to using 4k pages.
+ * For now this makes the whole process use 4k pages.
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
+{
+	if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
+		return;
+	slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
+#ifdef CONFIG_SPU_BASE
+	spu_flush_all_slbs(mm);
+#endif
+	if (get_paca_psize(addr) != MMU_PAGE_4K) {
+		get_paca()->context = mm->context;
+		slb_flush_and_rebolt();
+	}
+}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+/*
+ * This looks up a 2-bit protection code for a 4k subpage of a 64k page.
+ * Userspace sets the subpage permissions using the subpage_prot system call.
+ *
+ * Result is 0: full permissions, _PAGE_RW: read-only,
+ * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
+ */
+static int subpage_protection(pgd_t *pgdir, unsigned long ea)
+{
+	struct subpage_prot_table *spt = pgd_subpage_prot(pgdir);
+	u32 spp = 0;
+	u32 **sbpm, *sbpp;
+
+	if (ea >= spt->maxaddr)
+		return 0;
+	if (ea < 0x100000000) {
+		/* addresses below 4GB use spt->low_prot */
+		sbpm = spt->low_prot;
+	} else {
+		sbpm = spt->protptrs[ea >> SBP_L3_SHIFT];
+		if (!sbpm)
+			return 0;
+	}
+	sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+	if (!sbpp)
+		return 0;
+	spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)];
+
+	/* extract 2-bit bitfield for this 4k subpage */
+	spp >>= 30 - 2 * ((ea >> 12) & 0xf);
+
+	/* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */
+	spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0);
+	return spp;
+}
+
+#else /* CONFIG_PPC_SUBPAGE_PROT */
+static inline int subpage_protection(pgd_t *pgdir, unsigned long ea)
+{
+	return 0;
+}
+#endif
+
+/* Result code is:
+ *  0 - handled
+ *  1 - normal page fault
+ * -1 - critical hash insertion error
+ * -2 - access not permitted by subpage protection mechanism
+ */
+int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
+{
+	void *pgdir;
+	unsigned long vsid;
+	struct mm_struct *mm;
+	pte_t *ptep;
+	cpumask_t tmp;
+	int rc, user_region = 0, local = 0;
+	int psize, ssize;
+
+	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
+		ea, access, trap);
+
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) {
+		DBG_LOW(" out of pgtable range !\n");
+ 		return 1;
+	}
+
+	/* Get region & vsid */
+ 	switch (REGION_ID(ea)) {
+	case USER_REGION_ID:
+		user_region = 1;
+		mm = current->mm;
+		if (! mm) {
+			DBG_LOW(" user region with no mm !\n");
+			return 1;
+		}
+		psize = get_slice_psize(mm, ea);
+		ssize = user_segment_size(ea);
+		vsid = get_vsid(mm->context.id, ea, ssize);
+		break;
+	case VMALLOC_REGION_ID:
+		mm = &init_mm;
+		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		if (ea < VMALLOC_END)
+			psize = mmu_vmalloc_psize;
+		else
+			psize = mmu_io_psize;
+		ssize = mmu_kernel_ssize;
+		break;
+	default:
+		/* Not a valid range
+		 * Send the problem up to do_page_fault 
+		 */
+		return 1;
+	}
+	DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
+
+	/* Get pgdir */
+	pgdir = mm->pgd;
+	if (pgdir == NULL)
+		return 1;
+
+	/* Check CPU locality */
+	tmp = cpumask_of_cpu(smp_processor_id());
+	if (user_region && cpus_equal(mm->cpu_vm_mask, tmp))
+		local = 1;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	/* Handle hugepage regions */
+	if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
+		DBG_LOW(" -> huge page !\n");
+		return hash_huge_page(mm, access, ea, vsid, local, trap);
+	}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifndef CONFIG_PPC_64K_PAGES
+	/* If we use 4K pages and our psize is not 4K, then we are hitting
+	 * a special driver mapping, we need to align the address before
+	 * we fetch the PTE
+	 */
+	if (psize != MMU_PAGE_4K)
+		ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	/* Get PTE and page size from page tables */
+	ptep = find_linux_pte(pgdir, ea);
+	if (ptep == NULL || !pte_present(*ptep)) {
+		DBG_LOW(" no PTE !\n");
+		return 1;
+	}
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+	/* Pre-check access permissions (will be re-checked atomically
+	 * in __hash_page_XX but this pre-check is a fast path
+	 */
+	if (access & ~pte_val(*ptep)) {
+		DBG_LOW(" no access !\n");
+		return 1;
+	}
+
+	/* Do actual hashing */
+#ifdef CONFIG_PPC_64K_PAGES
+	/* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
+	if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+		demote_segment_4k(mm, ea);
+		psize = MMU_PAGE_4K;
+	}
+
+	/* If this PTE is non-cacheable and we have restrictions on
+	 * using non cacheable large pages, then we switch to 4k
+	 */
+	if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
+	    (pte_val(*ptep) & _PAGE_NO_CACHE)) {
+		if (user_region) {
+			demote_segment_4k(mm, ea);
+			psize = MMU_PAGE_4K;
+		} else if (ea < VMALLOC_END) {
+			/*
+			 * some driver did a non-cacheable mapping
+			 * in vmalloc space, so switch vmalloc
+			 * to 4k pages
+			 */
+			printk(KERN_ALERT "Reducing vmalloc segment "
+			       "to 4kB pages because of "
+			       "non-cacheable mapping\n");
+			psize = mmu_vmalloc_psize = MMU_PAGE_4K;
+#ifdef CONFIG_SPU_BASE
+			spu_flush_all_slbs(mm);
+#endif
+		}
+	}
+	if (user_region) {
+		if (psize != get_paca_psize(ea)) {
+			get_paca()->context = mm->context;
+			slb_flush_and_rebolt();
+		}
+	} else if (get_paca()->vmalloc_sllp !=
+		   mmu_psize_defs[mmu_vmalloc_psize].sllp) {
+		get_paca()->vmalloc_sllp =
+			mmu_psize_defs[mmu_vmalloc_psize].sllp;
+		slb_vmalloc_update();
+	}
+#endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_PPC_HAS_HASH_64K
+	if (psize == MMU_PAGE_64K)
+		rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
+	else
+#endif /* CONFIG_PPC_HAS_HASH_64K */
+	{
+		int spp = subpage_protection(pgdir, ea);
+		if (access & spp)
+			rc = -2;
+		else
+			rc = __hash_page_4K(ea, access, vsid, ptep, trap,
+					    local, ssize, spp);
+	}
+
+#ifndef CONFIG_PPC_64K_PAGES
+	DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep));
+#else
+	DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
+		pte_val(*(ptep + PTRS_PER_PTE)));
+#endif
+	DBG_LOW(" -> rc=%d\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(hash_page);
+
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  unsigned long access, unsigned long trap)
+{
+	unsigned long vsid;
+	void *pgdir;
+	pte_t *ptep;
+	cpumask_t mask;
+	unsigned long flags;
+	int local = 0;
+	int ssize;
+
+	BUG_ON(REGION_ID(ea) != USER_REGION_ID);
+
+#ifdef CONFIG_PPC_MM_SLICES
+	/* We only prefault standard pages for now */
+	if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
+		return;
+#endif
+
+	DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
+		" trap=%lx\n", mm, mm->pgd, ea, access, trap);
+
+	/* Get Linux PTE if available */
+	pgdir = mm->pgd;
+	if (pgdir == NULL)
+		return;
+	ptep = find_linux_pte(pgdir, ea);
+	if (!ptep)
+		return;
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
+	 * a 64K kernel), then we don't preload, hash_page() will take
+	 * care of it once we actually try to access the page.
+	 * That way we don't have to duplicate all of the logic for segment
+	 * page size demotion here
+	 */
+	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
+		return;
+#endif /* CONFIG_PPC_64K_PAGES */
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+
+	/* Hash doesn't like irqs */
+	local_irq_save(flags);
+
+	/* Is that local to this CPU ? */
+	mask = cpumask_of_cpu(smp_processor_id());
+	if (cpus_equal(mm->cpu_vm_mask, mask))
+		local = 1;
+
+	/* Hash it in */
+#ifdef CONFIG_PPC_HAS_HASH_64K
+	if (mm->context.user_psize == MMU_PAGE_64K)
+		__hash_page_64K(ea, access, vsid, ptep, trap, local, ssize);
+	else
+#endif /* CONFIG_PPC_HAS_HASH_64K */
+		__hash_page_4K(ea, access, vsid, ptep, trap, local, ssize,
+			       subpage_protection(pgdir, ea));
+
+	local_irq_restore(flags);
+}
+
+/* WARNING: This is called from hash_low_64.S, if you change this prototype,
+ *          do not forget to update the assembly call site !
+ */
+void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int ssize,
+		     int local)
+{
+	unsigned long hash, index, shift, hidx, slot;
+
+	DBG_LOW("flush_hash_page(va=%016x)\n", va);
+	pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+		hash = hpt_hash(va, shift, ssize);
+		hidx = __rpte_to_hidx(pte, index);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+		DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx);
+		ppc_md.hpte_invalidate(slot, va, psize, ssize, local);
+	} pte_iterate_hashed_end();
+}
+
+void flush_hash_range(unsigned long number, int local)
+{
+	if (ppc_md.flush_hash_range)
+		ppc_md.flush_hash_range(number, local);
+	else {
+		int i;
+		struct ppc64_tlb_batch *batch =
+			&__get_cpu_var(ppc64_tlb_batch);
+
+		for (i = 0; i < number; i++)
+			flush_hash_page(batch->vaddr[i], batch->pte[i],
+					batch->psize, batch->ssize, local);
+	}
+}
+
+/*
+ * low_hash_fault is called when we the low level hash code failed
+ * to instert a PTE due to an hypervisor error
+ */
+void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
+{
+	if (user_mode(regs)) {
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+		if (rc == -2)
+			_exception(SIGSEGV, regs, SEGV_ACCERR, address);
+		else
+#endif
+			_exception(SIGBUS, regs, BUS_ADRERR, address);
+	} else
+		bad_page_fault(regs, address, SIGBUS);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+	unsigned long hash, hpteg;
+	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+	unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
+	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
+	int ret;
+
+	hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
+	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+	ret = ppc_md.hpte_insert(hpteg, va, __pa(vaddr),
+				 mode, HPTE_V_BOLTED,
+				 mmu_linear_psize, mmu_kernel_ssize);
+	BUG_ON (ret < 0);
+	spin_lock(&linear_map_hash_lock);
+	BUG_ON(linear_map_hash_slots[lmi] & 0x80);
+	linear_map_hash_slots[lmi] = ret | 0x80;
+	spin_unlock(&linear_map_hash_lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+	unsigned long hash, hidx, slot;
+	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+	unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
+
+	hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
+	spin_lock(&linear_map_hash_lock);
+	BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
+	hidx = linear_map_hash_slots[lmi] & 0x7f;
+	linear_map_hash_slots[lmi] = 0;
+	spin_unlock(&linear_map_hash_lock);
+	if (hidx & _PTEIDX_SECONDARY)
+		hash = ~hash;
+	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	slot += hidx & _PTEIDX_GROUP_IX;
+	ppc_md.hpte_invalidate(slot, va, mmu_linear_psize, mmu_kernel_ssize, 0);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	unsigned long flags, vaddr, lmi;
+	int i;
+
+	local_irq_save(flags);
+	for (i = 0; i < numpages; i++, page++) {
+		vaddr = (unsigned long)page_address(page);
+		lmi = __pa(vaddr) >> PAGE_SHIFT;
+		if (lmi >= linear_map_hash_count)
+			continue;
+		if (enable)
+			kernel_map_linear_page(vaddr, lmi);
+		else
+			kernel_unmap_linear_page(vaddr, lmi);
+	}
+	local_irq_restore(flags);
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
new file mode 100644
index 0000000..f0c3b88
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -0,0 +1,781 @@
+/*
+ * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/spu.h>
+
+#define PAGE_SHIFT_64K	16
+#define PAGE_SHIFT_16M	24
+#define PAGE_SHIFT_16G	34
+
+#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define MAX_NUMBER_GPAGES	1024
+
+/* Tracks the 16G pages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  */
+static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;
+
+/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
+ * stored for the huge page sizes that are valid.
+ */
+unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
+
+#define hugepte_shift			mmu_huge_psizes
+#define PTRS_PER_HUGEPTE(psize)		(1 << hugepte_shift[psize])
+#define HUGEPTE_TABLE_SIZE(psize)	(sizeof(pte_t) << hugepte_shift[psize])
+
+#define HUGEPD_SHIFT(psize)		(mmu_psize_to_shift(psize) \
+						+ hugepte_shift[psize])
+#define HUGEPD_SIZE(psize)		(1UL << HUGEPD_SHIFT(psize))
+#define HUGEPD_MASK(psize)		(~(HUGEPD_SIZE(psize)-1))
+
+/* Subtract one from array size because we don't need a cache for 4K since
+ * is not a huge page size */
+#define huge_pgtable_cache(psize)	(pgtable_cache[HUGEPTE_CACHE_NUM \
+							+ psize-1])
+#define HUGEPTE_CACHE_NAME(psize)	(huge_pgtable_cache_name[psize])
+
+static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
+	"unused_4K", "hugepte_cache_64K", "unused_64K_AP",
+	"hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
+};
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK	0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)	((hpd).pd == 0)
+
+static inline int shift_to_mmu_psize(unsigned int shift)
+{
+	switch (shift) {
+#ifndef CONFIG_PPC_64K_PAGES
+	case PAGE_SHIFT_64K:
+	    return MMU_PAGE_64K;
+#endif
+	case PAGE_SHIFT_16M:
+	    return MMU_PAGE_16M;
+	case PAGE_SHIFT_16G:
+	    return MMU_PAGE_16G;
+	}
+	return -1;
+}
+
+static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
+{
+	if (mmu_psize_defs[mmu_psize].shift)
+		return mmu_psize_defs[mmu_psize].shift;
+	BUG();
+}
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+	BUG_ON(!(hpd.pd & HUGEPD_OK));
+	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+				    struct hstate *hstate)
+{
+	unsigned int shift = huge_page_shift(hstate);
+	int psize = shift_to_mmu_psize(shift);
+	unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+	pte_t *dir = hugepd_page(*hpdp);
+
+	return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+			   unsigned long address, unsigned int psize)
+{
+	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
+				      GFP_KERNEL|__GFP_REPEAT);
+
+	if (! new)
+		return -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+	if (!hugepd_none(*hpdp))
+		kmem_cache_free(huge_pgtable_cache(psize), new);
+	else
+		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+
+
+static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
+{
+	if (huge_page_shift(hstate) < PUD_SHIFT)
+		return pud_offset(pgd, addr);
+	else
+		return (pud_t *) pgd;
+}
+static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
+			 struct hstate *hstate)
+{
+	if (huge_page_shift(hstate) < PUD_SHIFT)
+		return pud_alloc(mm, pgd, addr);
+	else
+		return (pud_t *) pgd;
+}
+static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
+{
+	if (huge_page_shift(hstate) < PMD_SHIFT)
+		return pmd_offset(pud, addr);
+	else
+		return (pmd_t *) pud;
+}
+static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
+			 struct hstate *hstate)
+{
+	if (huge_page_shift(hstate) < PMD_SHIFT)
+		return pmd_alloc(mm, pud, addr);
+	else
+		return (pmd_t *) pud;
+}
+
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(unsigned long addr, unsigned long page_size,
+	unsigned long number_of_pages)
+{
+	if (!addr)
+		return;
+	while (number_of_pages > 0) {
+		gpage_freearray[nr_gpages] = addr;
+		nr_gpages++;
+		number_of_pages--;
+		addr += page_size;
+	}
+}
+
+/* Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	if (nr_gpages == 0)
+		return 0;
+	m = phys_to_virt(gpage_freearray[--nr_gpages]);
+	gpage_freearray[nr_gpages] = 0;
+	list_add(&m->list, &huge_boot_pages);
+	m->hstate = hstate;
+	return 1;
+}
+
+
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+
+	unsigned int psize;
+	unsigned int shift;
+	unsigned long sz;
+	struct hstate *hstate;
+	psize = get_slice_psize(mm, addr);
+	shift = mmu_psize_to_shift(psize);
+	sz = ((1UL) << shift);
+	hstate = size_to_hstate(sz);
+
+	addr &= hstate->mask;
+
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = hpud_offset(pg, addr, hstate);
+		if (!pud_none(*pu)) {
+			pm = hpmd_offset(pu, addr, hstate);
+			if (!pmd_none(*pm))
+				return hugepte_offset((hugepd_t *)pm, addr,
+						      hstate);
+		}
+	}
+
+	return NULL;
+}
+
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	hugepd_t *hpdp = NULL;
+	struct hstate *hstate;
+	unsigned int psize;
+	hstate = size_to_hstate(sz);
+
+	psize = get_slice_psize(mm, addr);
+	BUG_ON(!mmu_huge_psizes[psize]);
+
+	addr &= hstate->mask;
+
+	pg = pgd_offset(mm, addr);
+	pu = hpud_alloc(mm, pg, addr, hstate);
+
+	if (pu) {
+		pm = hpmd_alloc(mm, pu, addr, hstate);
+		if (pm)
+			hpdp = (hugepd_t *)pm;
+	}
+
+	if (! hpdp)
+		return NULL;
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr, hstate);
+}
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	return 0;
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
+			       unsigned int psize)
+{
+	pte_t *hugepte = hugepd_page(*hpdp);
+
+	hpdp->pd = 0;
+	tlb->need_flush = 1;
+	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
+						 HUGEPTE_CACHE_NUM+psize-1,
+						 PGF_CACHENUM_MASK));
+}
+
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling,
+				   unsigned int psize)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(*pmd))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
+	} while (pmd++, addr = next, addr != end);
+
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
+}
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
+{
+	pud_t *pud;
+	unsigned long next;
+	unsigned long start;
+	unsigned int shift;
+	unsigned int psize = get_slice_psize(tlb->mm, addr);
+	shift = mmu_psize_to_shift(psize);
+
+	start = addr;
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (shift < PMD_SHIFT) {
+			if (pud_none_or_clear_bad(pud))
+				continue;
+			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+					       ceiling, psize);
+		} else {
+			if (pud_none(*pud))
+				continue;
+			free_hugepte_range(tlb, (hugepd_t *)pud, psize);
+		}
+	} while (pud++, addr = next, addr != end);
+
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+			    unsigned long addr, unsigned long end,
+			    unsigned long floor, unsigned long ceiling)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long start;
+
+	/*
+	 * Comments below take from the normal free_pgd_range().  They
+	 * apply here too.  The tests against HUGEPD_MASK below are
+	 * essential, because we *don't* test for this at the bottom
+	 * level.  Without them we'll attempt to free a hugepte table
+	 * when we unmap just part of it, even if there are other
+	 * active mappings using it.
+	 *
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing HUGEPD* at this top level?  Because
+	 * often there will be no work to do at all, and we'd prefer
+	 * not to go all the way down to the bottom just to discover
+	 * that.
+	 *
+	 * Why all these "- 1"s?  Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we
+	 * must be careful to reject "the opposite 0" before it
+	 * confuses the subsequent tests.  But what about where end is
+	 * brought down by HUGEPD_SIZE below? no, end can't go down to
+	 * 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+	unsigned int psize = get_slice_psize(tlb->mm, addr);
+
+	addr &= HUGEPD_MASK(psize);
+	if (addr < floor) {
+		addr += HUGEPD_SIZE(psize);
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= HUGEPD_MASK(psize);
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= HUGEPD_SIZE(psize);
+	if (addr > end - 1)
+		return;
+
+	start = addr;
+	pgd = pgd_offset(tlb->mm, addr);
+	do {
+		psize = get_slice_psize(tlb->mm, addr);
+		BUG_ON(!mmu_huge_psizes[psize]);
+		next = pgd_addr_end(addr, end);
+		if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
+			if (pgd_none_or_clear_bad(pgd))
+				continue;
+			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+		} else {
+			if (pgd_none(*pgd))
+				continue;
+			free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
+		}
+	} while (pgd++, addr = next, addr != end);
+}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	if (pte_present(*ptep)) {
+		/* We open-code pte_clear because we need to pass the right
+		 * argument to hpte_need_flush (huge / !huge). Might not be
+		 * necessary anymore if we make hpte_need_flush() get the
+		 * page size from the slices
+		 */
+		unsigned int psize = get_slice_psize(mm, addr);
+		unsigned int shift = mmu_psize_to_shift(psize);
+		unsigned long sz = ((1UL) << shift);
+		struct hstate *hstate = size_to_hstate(sz);
+		pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
+	}
+	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
+{
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
+	return __pte(old);
+}
+
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+	pte_t *ptep;
+	struct page *page;
+	unsigned int mmu_psize = get_slice_psize(mm, address);
+
+	/* Verify it is a huge page else bail. */
+	if (!mmu_huge_psizes[mmu_psize])
+		return ERR_PTR(-EINVAL);
+
+	ptep = huge_pte_offset(mm, address);
+	page = pte_page(*ptep);
+	if (page) {
+		unsigned int shift = mmu_psize_to_shift(mmu_psize);
+		unsigned long sz = ((1UL) << shift);
+		page += (address % sz) / PAGE_SIZE;
+	}
+
+	return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd, int write)
+{
+	BUG();
+	return NULL;
+}
+
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+					unsigned long len, unsigned long pgoff,
+					unsigned long flags)
+{
+	struct hstate *hstate = hstate_file(file);
+	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+
+	if (!mmu_huge_psizes[mmu_psize])
+		return -EINVAL;
+	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+}
+
+/*
+ * Called by asm hashtable.S for doing lazy icache flush
+ */
+static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
+					pte_t pte, int trap, unsigned long sz)
+{
+	struct page *page;
+	int i;
+
+	if (!pfn_valid(pte_pfn(pte)))
+		return rflags;
+
+	page = pte_page(pte);
+
+	/* page is dirty */
+	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
+		if (trap == 0x400) {
+			for (i = 0; i < (sz / PAGE_SIZE); i++)
+				__flush_dcache_icache(page_address(page+i));
+			set_bit(PG_arch_1, &page->flags);
+		} else {
+			rflags |= HPTE_R_N;
+		}
+	}
+	return rflags;
+}
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+		   unsigned long ea, unsigned long vsid, int local,
+		   unsigned long trap)
+{
+	pte_t *ptep;
+	unsigned long old_pte, new_pte;
+	unsigned long va, rflags, pa, sz;
+	long slot;
+	int err = 1;
+	int ssize = user_segment_size(ea);
+	unsigned int mmu_psize;
+	int shift;
+	mmu_psize = get_slice_psize(mm, ea);
+
+	if (!mmu_huge_psizes[mmu_psize])
+		goto out;
+	ptep = huge_pte_offset(mm, ea);
+
+	/* Search the Linux page table for a match with va */
+	va = hpt_va(ea, vsid, ssize);
+
+	/*
+	 * If no pte found or not present, send the problem up to
+	 * do_page_fault
+	 */
+	if (unlikely(!ptep || pte_none(*ptep)))
+		goto out;
+
+	/* 
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
+	 */
+	if (unlikely(access & ~pte_val(*ptep)))
+		goto out;
+	/*
+	 * At this point, we have a pte (old_pte) which can be used to build
+	 * or update an HPTE. There are 2 cases:
+	 *
+	 * 1. There is a valid (present) pte with no associated HPTE (this is 
+	 *	the most common case)
+	 * 2. There is a valid (present) pte with an associated HPTE. The
+	 *	current values of the pp bits in the HPTE prevent access
+	 *	because we are doing software DIRTY bit management and the
+	 *	page is currently not DIRTY. 
+	 */
+
+
+	do {
+		old_pte = pte_val(*ptep);
+		if (old_pte & _PAGE_BUSY)
+			goto out;
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
+					 old_pte, new_pte));
+
+	rflags = 0x2 | (!(new_pte & _PAGE_RW));
+ 	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+	shift = mmu_psize_to_shift(mmu_psize);
+	sz = ((1UL) << shift);
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		/* No CPU has hugepages but lacks no execute, so we
+		 * don't need to worry about that case */
+		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
+						       trap, sz);
+
+	/* Check if pte already has an hpte (case 2) */
+	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+		/* There MIGHT be an HPTE for this pte */
+		unsigned long hash, slot;
+
+		hash = hpt_hash(va, shift, ssize);
+		if (old_pte & _PAGE_F_SECOND)
+			hash = ~hash;
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += (old_pte & _PAGE_F_GIX) >> 12;
+
+		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
+					 ssize, local) == -1)
+			old_pte &= ~_PAGE_HPTEFLAGS;
+	}
+
+	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+		unsigned long hash = hpt_hash(va, shift, ssize);
+		unsigned long hpte_group;
+
+		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+repeat:
+		hpte_group = ((hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+
+		/* clear HPTE slot informations in new PTE */
+#ifdef CONFIG_PPC_64K_PAGES
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
+#else
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+#endif
+		/* Add in WIMG bits */
+		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+				      _PAGE_COHERENT | _PAGE_GUARDED));
+
+		/* Insert into the hash table, primary slot */
+		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+					  mmu_psize, ssize);
+
+		/* Primary is full, try the secondary */
+		if (unlikely(slot == -1)) {
+			hpte_group = ((~hash & htab_hash_mask) *
+				      HPTES_PER_GROUP) & ~0x7UL; 
+			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
+						  HPTE_V_SECONDARY,
+						  mmu_psize, ssize);
+			if (slot == -1) {
+				if (mftb() & 0x1)
+					hpte_group = ((hash & htab_hash_mask) *
+						      HPTES_PER_GROUP)&~0x7UL;
+
+				ppc_md.hpte_remove(hpte_group);
+				goto repeat;
+                        }
+		}
+
+		if (unlikely(slot == -2))
+			panic("hash_huge_page: pte_insert failed\n");
+
+		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+	}
+
+	/*
+	 * No need to use ldarx/stdcx here
+	 */
+	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+
+	err = 0;
+
+ out:
+	return err;
+}
+
+static void __init set_huge_psize(int psize)
+{
+	/* Check that it is a page size supported by the hardware and
+	 * that it fits within pagetable limits. */
+	if (mmu_psize_defs[psize].shift &&
+		mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
+		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
+		 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
+		 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
+		/* Return if huge page size has already been setup or is the
+		 * same as the base page size. */
+		if (mmu_huge_psizes[psize] ||
+		   mmu_psize_defs[psize].shift == PAGE_SHIFT)
+			return;
+		hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
+
+		switch (mmu_psize_defs[psize].shift) {
+		case PAGE_SHIFT_64K:
+		    /* We only allow 64k hpages with 4k base page,
+		     * which was checked above, and always put them
+		     * at the PMD */
+		    hugepte_shift[psize] = PMD_SHIFT;
+		    break;
+		case PAGE_SHIFT_16M:
+		    /* 16M pages can be at two different levels
+		     * of pagestables based on base page size */
+		    if (PAGE_SHIFT == PAGE_SHIFT_64K)
+			    hugepte_shift[psize] = PMD_SHIFT;
+		    else /* 4k base page */
+			    hugepte_shift[psize] = PUD_SHIFT;
+		    break;
+		case PAGE_SHIFT_16G:
+		    /* 16G pages are always at PGD level */
+		    hugepte_shift[psize] = PGDIR_SHIFT;
+		    break;
+		}
+		hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
+	} else
+		hugepte_shift[psize] = 0;
+}
+
+static int __init hugepage_setup_sz(char *str)
+{
+	unsigned long long size;
+	int mmu_psize;
+	int shift;
+
+	size = memparse(str, &str);
+
+	shift = __ffs(size);
+	mmu_psize = shift_to_mmu_psize(shift);
+	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
+		set_huge_psize(mmu_psize);
+	else
+		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+
+	return 1;
+}
+__setup("hugepagesz=", hugepage_setup_sz);
+
+static int __init hugetlbpage_init(void)
+{
+	unsigned int psize;
+
+	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+		return -ENODEV;
+
+	/* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
+	 * and adjust PTE_NONCACHE_NUM if the number of supported huge page
+	 * sizes changes.
+	 */
+	set_huge_psize(MMU_PAGE_16M);
+	set_huge_psize(MMU_PAGE_16G);
+
+	/* Temporarily disable support for 64K huge pages when 64K SPU local
+	 * store support is enabled as the current implementation conflicts.
+	 */
+#ifndef CONFIG_SPU_FS_64K_LS
+	set_huge_psize(MMU_PAGE_64K);
+#endif
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		if (mmu_huge_psizes[psize]) {
+			huge_pgtable_cache(psize) = kmem_cache_create(
+						HUGEPTE_CACHE_NAME(psize),
+						HUGEPTE_TABLE_SIZE(psize),
+						HUGEPTE_TABLE_SIZE(psize),
+						0,
+						NULL);
+			if (!huge_pgtable_cache(psize))
+				panic("hugetlbpage_init(): could not create %s"\
+				      "\n", HUGEPTE_CACHE_NAME(psize));
+		}
+	}
+
+	return 0;
+}
+
+module_init(hugetlbpage_init);
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
new file mode 100644
index 0000000..388ceda
--- /dev/null
+++ b/arch/powerpc/mm/init_32.c
@@ -0,0 +1,290 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *  PPC44x/36-bit changes by Matt Porter (mporter@mvista.com)
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+#include <linux/lmb.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/btext.h>
+#include <asm/tlb.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+
+#include "mmu_decl.h"
+
+#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
+/* The ammount of lowmem must be within 0xF0000000 - KERNELBASE. */
+#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - KERNELBASE))
+#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_START_KERNEL"
+#endif
+#endif
+#define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+phys_addr_t total_memory;
+phys_addr_t total_lowmem;
+
+phys_addr_t memstart_addr = (phys_addr_t)~0ull;
+EXPORT_SYMBOL(memstart_addr);
+phys_addr_t kernstart_addr;
+EXPORT_SYMBOL(kernstart_addr);
+phys_addr_t lowmem_end_addr;
+
+int boot_mapsize;
+#ifdef CONFIG_PPC_PMAC
+unsigned long agp_special_page;
+EXPORT_SYMBOL(agp_special_page);
+#endif
+
+void MMU_init(void);
+
+/* XXX should be in current.h  -- paulus */
+extern struct task_struct *current_set[NR_CPUS];
+
+/*
+ * this tells the system to map all of ram with the segregs
+ * (i.e. page tables) instead of the bats.
+ * -- Cort
+ */
+int __map_without_bats;
+int __map_without_ltlbs;
+
+/* max amount of low RAM to map in */
+unsigned long __max_low_memory = MAX_LOW_MEM;
+
+/*
+ * address of the limit of what is accessible with initial MMU setup -
+ * 256MB usually, but only 16MB on 601.
+ */
+phys_addr_t __initial_memory_limit_addr = (phys_addr_t)0x10000000;
+
+/*
+ * Check for command-line options that affect what MMU_init will do.
+ */
+void MMU_setup(void)
+{
+	/* Check for nobats option (used in mapin_ram). */
+	if (strstr(cmd_line, "nobats")) {
+		__map_without_bats = 1;
+	}
+
+	if (strstr(cmd_line, "noltlbs")) {
+		__map_without_ltlbs = 1;
+	}
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	__map_without_bats = 1;
+	__map_without_ltlbs = 1;
+#endif
+}
+
+/*
+ * MMU_init sets up the basic memory mappings for the kernel,
+ * including both RAM and possibly some I/O regions,
+ * and sets up the page tables and the MMU hardware ready to go.
+ */
+void __init MMU_init(void)
+{
+	if (ppc_md.progress)
+		ppc_md.progress("MMU:enter", 0x111);
+
+	/* 601 can only access 16MB at the moment */
+	if (PVR_VER(mfspr(SPRN_PVR)) == 1)
+		__initial_memory_limit_addr = 0x01000000;
+	/* 8xx can only access 8MB at the moment */
+	if (PVR_VER(mfspr(SPRN_PVR)) == 0x50)
+		__initial_memory_limit_addr = 0x00800000;
+
+	/* parse args from command line */
+	MMU_setup();
+
+	if (lmb.memory.cnt > 1) {
+		lmb.memory.cnt = 1;
+		lmb_analyze();
+		printk(KERN_WARNING "Only using first contiguous memory region");
+	}
+
+	total_lowmem = total_memory = lmb_end_of_DRAM() - memstart_addr;
+	lowmem_end_addr = memstart_addr + total_lowmem;
+
+#ifdef CONFIG_FSL_BOOKE
+	/* Freescale Book-E parts expect lowmem to be mapped by fixed TLB
+	 * entries, so we need to adjust lowmem to match the amount we can map
+	 * in the fixed entries */
+	adjust_total_lowmem();
+#endif /* CONFIG_FSL_BOOKE */
+
+	if (total_lowmem > __max_low_memory) {
+		total_lowmem = __max_low_memory;
+		lowmem_end_addr = memstart_addr + total_lowmem;
+#ifndef CONFIG_HIGHMEM
+		total_memory = total_lowmem;
+		lmb_enforce_memory_limit(lowmem_end_addr);
+		lmb_analyze();
+#endif /* CONFIG_HIGHMEM */
+	}
+
+	/* Initialize the MMU hardware */
+	if (ppc_md.progress)
+		ppc_md.progress("MMU:hw init", 0x300);
+	MMU_init_hw();
+
+	/* Map in all of RAM starting at KERNELBASE */
+	if (ppc_md.progress)
+		ppc_md.progress("MMU:mapin", 0x301);
+	mapin_ram();
+
+#ifdef CONFIG_HIGHMEM
+	ioremap_base = PKMAP_BASE;
+#else
+	ioremap_base = 0xfe000000UL;	/* for now, could be 0xfffff000 */
+#endif /* CONFIG_HIGHMEM */
+	ioremap_bot = ioremap_base;
+
+	/* Map in I/O resources */
+	if (ppc_md.progress)
+		ppc_md.progress("MMU:setio", 0x302);
+
+	/* Initialize the context management stuff */
+	mmu_context_init();
+
+	if (ppc_md.progress)
+		ppc_md.progress("MMU:exit", 0x211);
+
+	/* From now on, btext is no longer BAT mapped if it was at all */
+#ifdef CONFIG_BOOTX_TEXT
+	btext_unmap();
+#endif
+}
+
+/* This is only called until mem_init is done. */
+void __init *early_get_page(void)
+{
+	void *p;
+
+	if (init_bootmem_done) {
+		p = alloc_bootmem_pages(PAGE_SIZE);
+	} else {
+		p = __va(lmb_alloc_base(PAGE_SIZE, PAGE_SIZE,
+					__initial_memory_limit_addr));
+	}
+	return p;
+}
+
+/* Free up now-unused memory */
+static void free_sec(unsigned long start, unsigned long end, const char *name)
+{
+	unsigned long cnt = 0;
+
+	while (start < end) {
+		ClearPageReserved(virt_to_page(start));
+		init_page_count(virt_to_page(start));
+		free_page(start);
+		cnt++;
+		start += PAGE_SIZE;
+ 	}
+	if (cnt) {
+		printk(" %ldk %s", cnt << (PAGE_SHIFT - 10), name);
+		totalram_pages += cnt;
+	}
+}
+
+void free_initmem(void)
+{
+#define FREESEC(TYPE) \
+	free_sec((unsigned long)(&__ ## TYPE ## _begin), \
+		 (unsigned long)(&__ ## TYPE ## _end), \
+		 #TYPE);
+
+	printk ("Freeing unused kernel memory:");
+	FREESEC(init);
+ 	printk("\n");
+	ppc_md.progress = NULL;
+#undef FREESEC
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	if (start < end)
+		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		init_page_count(virt_to_page(start));
+		free_page(start);
+		totalram_pages++;
+	}
+}
+#endif
+
+#ifdef CONFIG_PROC_KCORE
+static struct kcore_list kcore_vmem;
+
+static int __init setup_kcore(void)
+{
+	int i;
+
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long base;
+		unsigned long size;
+		struct kcore_list *kcore_mem;
+
+		base = lmb.memory.region[i].base;
+		size = lmb.memory.region[i].size;
+
+		kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
+		if (!kcore_mem)
+			panic("%s: kmalloc failed\n", __func__);
+
+		/* must stay under 32 bits */
+		if ( 0xfffffffful - (unsigned long)__va(base) < size) {
+			size = 0xfffffffful - (unsigned long)(__va(base));
+			printk(KERN_DEBUG "setup_kcore: restrict size=%lx\n",
+						size);
+		}
+
+		kclist_add(kcore_mem, __va(base), size);
+	}
+
+	kclist_add(&kcore_vmem, (void *)VMALLOC_START,
+		VMALLOC_END-VMALLOC_START);
+
+	return 0;
+}
+module_init(setup_kcore);
+#endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
new file mode 100644
index 0000000..3e6a654
--- /dev/null
+++ b/arch/powerpc/mm/init_64.c
@@ -0,0 +1,238 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#undef DEBUG
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/idr.h>
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/poison.h>
+#include <linux/lmb.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/eeh.h>
+#include <asm/processor.h>
+#include <asm/mmzone.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/iommu.h>
+#include <asm/abs_addr.h>
+#include <asm/vdso.h>
+
+#include "mmu_decl.h"
+
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
+phys_addr_t memstart_addr = ~0;
+phys_addr_t kernstart_addr;
+
+void free_initmem(void)
+{
+	unsigned long addr;
+
+	addr = (unsigned long)__init_begin;
+	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
+		ClearPageReserved(virt_to_page(addr));
+		init_page_count(virt_to_page(addr));
+		free_page(addr);
+		totalram_pages++;
+	}
+	printk ("Freeing unused kernel memory: %luk freed\n",
+		((unsigned long)__init_end - (unsigned long)__init_begin) >> 10);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	if (start < end)
+		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		init_page_count(virt_to_page(start));
+		free_page(start);
+		totalram_pages++;
+	}
+}
+#endif
+
+#ifdef CONFIG_PROC_KCORE
+static struct kcore_list kcore_vmem;
+
+static int __init setup_kcore(void)
+{
+	int i;
+
+	for (i=0; i < lmb.memory.cnt; i++) {
+		unsigned long base, size;
+		struct kcore_list *kcore_mem;
+
+		base = lmb.memory.region[i].base;
+		size = lmb.memory.region[i].size;
+
+		/* GFP_ATOMIC to avoid might_sleep warnings during boot */
+		kcore_mem = kmalloc(sizeof(struct kcore_list), GFP_ATOMIC);
+		if (!kcore_mem)
+			panic("%s: kmalloc failed\n", __func__);
+
+		kclist_add(kcore_mem, __va(base), size);
+	}
+
+	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+
+	return 0;
+}
+module_init(setup_kcore);
+#endif
+
+static void pgd_ctor(void *addr)
+{
+	memset(addr, 0, PGD_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+	memset(addr, 0, PMD_TABLE_SIZE);
+}
+
+static const unsigned int pgtable_cache_size[2] = {
+	PGD_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+#ifdef CONFIG_PPC_64K_PAGES
+	"pgd_cache", "pmd_cache",
+#else
+	"pgd_cache", "pud_pmd_cache",
+#endif /* CONFIG_PPC_64K_PAGES */
+};
+
+#ifdef CONFIG_HUGETLB_PAGE
+/* Hugepages need an extra cache per hugepagesize, initialized in
+ * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
+ * is not compile time constant. */
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
+#else
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+#endif
+
+void pgtable_cache_init(void)
+{
+	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
+	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Given an address within the vmemmap, determine the pfn of the page that
+ * represents the start of the section it is within.  Note that we have to
+ * do this by hand as the proffered address may not be correctly aligned.
+ * Subtraction of non-aligned pointers produces undefined results.
+ */
+static unsigned long __meminit vmemmap_section_start(unsigned long page)
+{
+	unsigned long offset = page - ((unsigned long)(vmemmap));
+
+	/* Return the pfn of the start of the section. */
+	return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
+}
+
+/*
+ * Check if this vmemmap page is already initialised.  If any section
+ * which overlaps this vmemmap page is initialised then this page is
+ * initialised already.
+ */
+static int __meminit vmemmap_populated(unsigned long start, int page_size)
+{
+	unsigned long end = start + page_size;
+
+	for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
+		if (pfn_valid(vmemmap_section_start(start)))
+			return 1;
+
+	return 0;
+}
+
+int __meminit vmemmap_populate(struct page *start_page,
+			       unsigned long nr_pages, int node)
+{
+	unsigned long start = (unsigned long)start_page;
+	unsigned long end = (unsigned long)(start_page + nr_pages);
+	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+
+	/* Align to the page size of the linear mapping. */
+	start = _ALIGN_DOWN(start, page_size);
+
+	for (; start < end; start += page_size) {
+		int mapped;
+		void *p;
+
+		if (vmemmap_populated(start, page_size))
+			continue;
+
+		p = vmemmap_alloc_block(page_size, node);
+		if (!p)
+			return -ENOMEM;
+
+		pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n",
+			start, p, __pa(p));
+
+		mapped = htab_bolt_mapping(start, start + page_size, __pa(p),
+					   pgprot_val(PAGE_KERNEL),
+					   mmu_vmemmap_psize, mmu_kernel_ssize);
+		BUG_ON(mapped < 0);
+	}
+
+	return 0;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
new file mode 100644
index 0000000..b9e1a1d
--- /dev/null
+++ b/arch/powerpc/mm/mem.c
@@ -0,0 +1,529 @@
+/*
+ *  PowerPC version
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *  PPC44x/36-bit changes by Matt Porter (mporter@mvista.com)
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+#include <linux/suspend.h>
+#include <linux/lmb.h>
+
+#include <asm/pgalloc.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/btext.h>
+#include <asm/tlb.h>
+#include <asm/sections.h>
+#include <asm/sparsemem.h>
+#include <asm/vdso.h>
+#include <asm/fixmap.h>
+
+#include "mmu_decl.h"
+
+#ifndef CPU_FTR_COHERENT_ICACHE
+#define CPU_FTR_COHERENT_ICACHE	0	/* XXX for now */
+#define CPU_FTR_NOEXECUTE	0
+#endif
+
+int init_bootmem_done;
+int mem_init_done;
+unsigned long memory_limit;
+
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+EXPORT_SYMBOL(kmap_prot);
+EXPORT_SYMBOL(kmap_pte);
+
+static inline pte_t *virt_to_kpte(unsigned long vaddr)
+{
+	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr);
+}
+#endif
+
+int page_is_ram(unsigned long pfn)
+{
+#ifndef CONFIG_PPC64	/* XXX for now */
+	return pfn < max_pfn;
+#else
+	unsigned long paddr = (pfn << PAGE_SHIFT);
+	int i;
+	for (i=0; i < lmb.memory.cnt; i++) {
+		unsigned long base;
+
+		base = lmb.memory.region[i].base;
+
+		if ((paddr >= base) &&
+			(paddr < (base + lmb.memory.region[i].size))) {
+			return 1;
+		}
+	}
+
+	return 0;
+#endif
+}
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+			      unsigned long size, pgprot_t vma_prot)
+{
+	if (ppc_md.phys_mem_access_prot)
+		return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot);
+
+	if (!page_is_ram(pfn))
+		vma_prot = __pgprot(pgprot_val(vma_prot)
+				    | _PAGE_GUARDED | _PAGE_NO_CACHE);
+	return vma_prot;
+}
+EXPORT_SYMBOL(phys_mem_access_prot);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return hot_add_scn_to_nid(start);
+}
+#endif
+
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+	struct pglist_data *pgdata;
+	struct zone *zone;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	pgdata = NODE_DATA(nid);
+
+	start = (unsigned long)__va(start);
+	create_section_mapping(start, start + size);
+
+	/* this should work for most non-highmem platforms */
+	zone = pgdata->node_zones;
+
+	return __add_pages(zone, start_pfn, nr_pages);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+/*
+ * walk_memory_resource() needs to make sure there is no holes in a given
+ * memory range.  PPC64 does not maintain the memory layout in /proc/iomem.
+ * Instead it maintains it in lmb.memory structures.  Walk through the
+ * memory regions, find holes and callback for contiguous regions.
+ */
+int
+walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
+			int (*func)(unsigned long, unsigned long, void *))
+{
+	struct lmb_property res;
+	unsigned long pfn, len;
+	u64 end;
+	int ret = -1;
+
+	res.base = (u64) start_pfn << PAGE_SHIFT;
+	res.size = (u64) nr_pages << PAGE_SHIFT;
+
+	end = res.base + res.size - 1;
+	while ((res.base < end) && (lmb_find(&res) >= 0)) {
+		pfn = (unsigned long)(res.base >> PAGE_SHIFT);
+		len = (unsigned long)(res.size >> PAGE_SHIFT);
+		ret = (*func)(pfn, len, arg);
+		if (ret)
+			break;
+		res.base += (res.size + 1);
+		res.size = (end - res.base + 1);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(walk_memory_resource);
+
+/*
+ * Initialize the bootmem system and give it all the memory we
+ * have available.  If we are using highmem, we only put the
+ * lowmem into the bootmem system.
+ */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init do_init_bootmem(void)
+{
+	unsigned long i;
+	unsigned long start, bootmap_pages;
+	unsigned long total_pages;
+	int boot_mapsize;
+
+	max_low_pfn = max_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	total_pages = (lmb_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT;
+#ifdef CONFIG_HIGHMEM
+	total_pages = total_lowmem >> PAGE_SHIFT;
+	max_low_pfn = lowmem_end_addr >> PAGE_SHIFT;
+#endif
+
+	/*
+	 * Find an area to use for the bootmem bitmap.  Calculate the size of
+	 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
+	 * Add 1 additional page in case the address isn't page-aligned.
+	 */
+	bootmap_pages = bootmem_bootmap_pages(total_pages);
+
+	start = lmb_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
+
+	min_low_pfn = MEMORY_START >> PAGE_SHIFT;
+	boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
+
+	/* Add active regions with valid PFNs */
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long start_pfn, end_pfn;
+		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+		add_active_range(0, start_pfn, end_pfn);
+	}
+
+	/* Add all physical memory to the bootmem map, mark each area
+	 * present.
+	 */
+#ifdef CONFIG_HIGHMEM
+	free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT);
+
+	/* reserve the sections we're already using */
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long addr = lmb.reserved.region[i].base +
+				     lmb_size_bytes(&lmb.reserved, i) - 1;
+		if (addr < lowmem_end_addr)
+			reserve_bootmem(lmb.reserved.region[i].base,
+					lmb_size_bytes(&lmb.reserved, i),
+					BOOTMEM_DEFAULT);
+		else if (lmb.reserved.region[i].base < lowmem_end_addr) {
+			unsigned long adjusted_size = lowmem_end_addr -
+				      lmb.reserved.region[i].base;
+			reserve_bootmem(lmb.reserved.region[i].base,
+					adjusted_size, BOOTMEM_DEFAULT);
+		}
+	}
+#else
+	free_bootmem_with_active_regions(0, max_pfn);
+
+	/* reserve the sections we're already using */
+	for (i = 0; i < lmb.reserved.cnt; i++)
+		reserve_bootmem(lmb.reserved.region[i].base,
+				lmb_size_bytes(&lmb.reserved, i),
+				BOOTMEM_DEFAULT);
+
+#endif
+	/* XXX need to clip this if using highmem? */
+	sparse_memory_present_with_active_regions(0);
+
+	init_bootmem_done = 1;
+}
+
+/* mark pages that don't exist as nosave */
+static int __init mark_nonram_nosave(void)
+{
+	unsigned long lmb_next_region_start_pfn,
+		      lmb_region_max_pfn;
+	int i;
+
+	for (i = 0; i < lmb.memory.cnt - 1; i++) {
+		lmb_region_max_pfn =
+			(lmb.memory.region[i].base >> PAGE_SHIFT) +
+			(lmb.memory.region[i].size >> PAGE_SHIFT);
+		lmb_next_region_start_pfn =
+			lmb.memory.region[i+1].base >> PAGE_SHIFT;
+
+		if (lmb_region_max_pfn < lmb_next_region_start_pfn)
+			register_nosave_region(lmb_region_max_pfn,
+					       lmb_next_region_start_pfn);
+	}
+
+	return 0;
+}
+
+/*
+ * paging_init() sets up the page tables - in fact we've already done this.
+ */
+void __init paging_init(void)
+{
+	unsigned long total_ram = lmb_phys_mem_size();
+	phys_addr_t top_of_ram = lmb_end_of_DRAM();
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+#ifdef CONFIG_PPC32
+	unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
+	unsigned long end = __fix_to_virt(FIX_HOLE);
+
+	for (; v < end; v += PAGE_SIZE)
+		map_page(v, 0, 0); /* XXX gross */
+#endif
+
+#ifdef CONFIG_HIGHMEM
+	map_page(PKMAP_BASE, 0, 0);	/* XXX gross */
+	pkmap_page_table = virt_to_kpte(PKMAP_BASE);
+
+	kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
+	kmap_prot = PAGE_KERNEL;
+#endif /* CONFIG_HIGHMEM */
+
+	printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%lx\n",
+	       (unsigned long long)top_of_ram, total_ram);
+	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
+	       (long int)((top_of_ram - total_ram) >> 20));
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT;
+#else
+	max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT;
+#endif
+	free_area_init_nodes(max_zone_pfns);
+
+	mark_nonram_nosave();
+}
+#endif /* ! CONFIG_NEED_MULTIPLE_NODES */
+
+void __init mem_init(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int nid;
+#endif
+	pg_data_t *pgdat;
+	unsigned long i;
+	struct page *page;
+	unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+
+	num_physpages = lmb.memory.size >> PAGE_SHIFT;
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+        for_each_online_node(nid) {
+		if (NODE_DATA(nid)->node_spanned_pages != 0) {
+			printk("freeing bootmem node %d\n", nid);
+			totalram_pages +=
+				free_all_bootmem_node(NODE_DATA(nid));
+		}
+	}
+#else
+	max_mapnr = max_pfn;
+	totalram_pages += free_all_bootmem();
+#endif
+	for_each_online_pgdat(pgdat) {
+		for (i = 0; i < pgdat->node_spanned_pages; i++) {
+			if (!pfn_valid(pgdat->node_start_pfn + i))
+				continue;
+			page = pgdat_page_nr(pgdat, i);
+			if (PageReserved(page))
+				reservedpages++;
+		}
+	}
+
+	codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
+	datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
+	initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
+	bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+
+#ifdef CONFIG_HIGHMEM
+	{
+		unsigned long pfn, highmem_mapnr;
+
+		highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
+		for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
+			struct page *page = pfn_to_page(pfn);
+			if (lmb_is_reserved(pfn << PAGE_SHIFT))
+				continue;
+			ClearPageReserved(page);
+			init_page_count(page);
+			__free_page(page);
+			totalhigh_pages++;
+			reservedpages--;
+		}
+		totalram_pages += totalhigh_pages;
+		printk(KERN_DEBUG "High memory: %luk\n",
+		       totalhigh_pages << (PAGE_SHIFT-10));
+	}
+#endif /* CONFIG_HIGHMEM */
+
+	printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
+	       "%luk reserved, %luk data, %luk bss, %luk init)\n",
+		(unsigned long)nr_free_pages() << (PAGE_SHIFT-10),
+		num_physpages << (PAGE_SHIFT-10),
+		codesize >> 10,
+		reservedpages << (PAGE_SHIFT-10),
+		datasize >> 10,
+		bsssize >> 10,
+		initsize >> 10);
+
+	mem_init_done = 1;
+}
+
+/*
+ * This is called when a page has been modified by the kernel.
+ * It just marks the page as not i-cache clean.  We do the i-cache
+ * flush later when the page is given to a user process, if necessary.
+ */
+void flush_dcache_page(struct page *page)
+{
+	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+		return;
+	/* avoid an atomic op if possible */
+	if (test_bit(PG_arch_1, &page->flags))
+		clear_bit(PG_arch_1, &page->flags);
+}
+EXPORT_SYMBOL(flush_dcache_page);
+
+void flush_dcache_icache_page(struct page *page)
+{
+#ifdef CONFIG_BOOKE
+	void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
+	__flush_dcache_icache(start);
+	kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
+	/* On 8xx there is no need to kmap since highmem is not supported */
+	__flush_dcache_icache(page_address(page)); 
+#else
+	__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
+#endif
+
+}
+void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
+{
+	clear_page(page);
+
+	/*
+	 * We shouldnt have to do this, but some versions of glibc
+	 * require it (ld.so assumes zero filled pages are icache clean)
+	 * - Anton
+	 */
+	flush_dcache_page(pg);
+}
+EXPORT_SYMBOL(clear_user_page);
+
+void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
+		    struct page *pg)
+{
+	copy_page(vto, vfrom);
+
+	/*
+	 * We should be able to use the following optimisation, however
+	 * there are two problems.
+	 * Firstly a bug in some versions of binutils meant PLT sections
+	 * were not marked executable.
+	 * Secondly the first word in the GOT section is blrl, used
+	 * to establish the GOT address. Until recently the GOT was
+	 * not marked executable.
+	 * - Anton
+	 */
+#if 0
+	if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0))
+		return;
+#endif
+
+	flush_dcache_page(pg);
+}
+
+void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
+			     unsigned long addr, int len)
+{
+	unsigned long maddr;
+
+	maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK);
+	flush_icache_range(maddr, maddr + len);
+	kunmap(page);
+}
+EXPORT_SYMBOL(flush_icache_user_range);
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ * 
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+		      pte_t pte)
+{
+#ifdef CONFIG_PPC_STD_MMU
+	unsigned long access = 0, trap;
+#endif
+	unsigned long pfn = pte_pfn(pte);
+
+	/* handle i-cache coherency */
+	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
+	    !cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	    pfn_valid(pfn)) {
+		struct page *page = pfn_to_page(pfn);
+#ifdef CONFIG_8xx
+		/* On 8xx, cache control instructions (particularly
+		 * "dcbst" from flush_dcache_icache) fault as write
+		 * operation if there is an unpopulated TLB entry
+		 * for the address in question. To workaround that,
+		 * we invalidate the TLB here, thus avoiding dcbst
+		 * misbehaviour.
+		 */
+		_tlbie(address, 0 /* 8xx doesn't care about PID */);
+#endif
+		/* The _PAGE_USER test should really be _PAGE_EXEC, but
+		 * older glibc versions execute some code from no-exec
+		 * pages, which for now we are supporting.  If exec-only
+		 * pages are ever implemented, this will have to change.
+		 */
+		if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER)
+		    && !test_bit(PG_arch_1, &page->flags)) {
+			if (vma->vm_mm == current->active_mm) {
+				__flush_dcache_icache((void *) address);
+			} else
+				flush_dcache_icache_page(page);
+			set_bit(PG_arch_1, &page->flags);
+		}
+	}
+
+#ifdef CONFIG_PPC_STD_MMU
+	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+	if (!pte_young(pte) || address >= TASK_SIZE)
+		return;
+
+	/* We try to figure out if we are coming from an instruction
+	 * access fault and pass that down to __hash_page so we avoid
+	 * double-faulting on execution of fresh text. We have to test
+	 * for regs NULL since init will get here first thing at boot
+	 *
+	 * We also avoid filling the hash if not coming from a fault
+	 */
+	if (current->thread.regs == NULL)
+		return;
+	trap = TRAP(current->thread.regs);
+	if (trap == 0x400)
+		access |= _PAGE_EXEC;
+	else if (trap != 0x300)
+		return;
+	hash_preload(vma->vm_mm, address, access, trap);
+#endif /* CONFIG_PPC_STD_MMU */
+}
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
new file mode 100644
index 0000000..86010fc
--- /dev/null
+++ b/arch/powerpc/mm/mmap.c
@@ -0,0 +1,85 @@
+/*
+ *  flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave an at least ~128 MB hole.
+ */
+#define MIN_GAP (128*1024*1024)
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static inline unsigned long mmap_base(void)
+{
+	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return TASK_SIZE - (gap & PAGE_MASK);
+}
+
+static inline int mmap_is_legacy(void)
+{
+	/*
+	 * Force standard allocation for 64 bit programs.
+	 */
+	if (!test_thread_flag(TIF_32BIT))
+		return 1;
+
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	/*
+	 * Fall back to the standard layout if the personality
+	 * bit is set, or if the expected stack growth is unlimited:
+	 */
+	if (mmap_is_legacy()) {
+		mm->mmap_base = TASK_UNMAPPED_BASE;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+		mm->unmap_area = arch_unmap_area;
+	} else {
+		mm->mmap_base = mmap_base();
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		mm->unmap_area = arch_unmap_area_topdown;
+	}
+}
diff --git a/arch/powerpc/mm/mmu_context_32.c b/arch/powerpc/mm/mmu_context_32.c
new file mode 100644
index 0000000..cc32ba4
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_32.c
@@ -0,0 +1,84 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification.  This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+unsigned long next_mmu_context;
+unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
+#ifdef FEW_CONTEXTS
+atomic_t nr_free_contexts;
+struct mm_struct *context_mm[LAST_CONTEXT+1];
+void steal_context(void);
+#endif /* FEW_CONTEXTS */
+
+/*
+ * Initialize the context management stuff.
+ */
+void __init
+mmu_context_init(void)
+{
+	/*
+	 * Some processors have too few contexts to reserve one for
+	 * init_mm, and require using context 0 for a normal task.
+	 * Other processors reserve the use of context zero for the kernel.
+	 * This code assumes FIRST_CONTEXT < 32.
+	 */
+	context_map[0] = (1 << FIRST_CONTEXT) - 1;
+	next_mmu_context = FIRST_CONTEXT;
+#ifdef FEW_CONTEXTS
+	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+#endif /* FEW_CONTEXTS */
+}
+
+#ifdef FEW_CONTEXTS
+/*
+ * Steal a context from a task that has one at the moment.
+ * This is only used on 8xx and 4xx and we presently assume that
+ * they don't do SMP.  If they do then this will have to check
+ * whether the MM we steal is in use.
+ * We also assume that this is only used on systems that don't
+ * use an MMU hash table - this is true for 8xx and 4xx.
+ * This isn't an LRU system, it just frees up each context in
+ * turn (sort-of pseudo-random replacement :).  This would be the
+ * place to implement an LRU scheme if anyone was motivated to do it.
+ *  -- paulus
+ */
+void
+steal_context(void)
+{
+	struct mm_struct *mm;
+
+	/* free up context `next_mmu_context' */
+	/* if we shouldn't free context 0, don't... */
+	if (next_mmu_context < FIRST_CONTEXT)
+		next_mmu_context = FIRST_CONTEXT;
+	mm = context_mm[next_mmu_context];
+	flush_tlb_mm(mm);
+	destroy_context(mm);
+}
+#endif /* FEW_CONTEXTS */
diff --git a/arch/powerpc/mm/mmu_context_64.c b/arch/powerpc/mm/mmu_context_64.c
new file mode 100644
index 0000000..1db38ba
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_64.c
@@ -0,0 +1,70 @@
+/*
+ *  MMU context allocation for 64-bit kernels.
+ *
+ *  Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+
+#include <asm/mmu_context.h>
+
+static DEFINE_SPINLOCK(mmu_context_lock);
+static DEFINE_IDR(mmu_context_idr);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	int index;
+	int err;
+
+again:
+	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(&mmu_context_lock);
+	err = idr_get_new_above(&mmu_context_idr, NULL, 1, &index);
+	spin_unlock(&mmu_context_lock);
+
+	if (err == -EAGAIN)
+		goto again;
+	else if (err)
+		return err;
+
+	if (index > MAX_CONTEXT) {
+		spin_lock(&mmu_context_lock);
+		idr_remove(&mmu_context_idr, index);
+		spin_unlock(&mmu_context_lock);
+		return -ENOMEM;
+	}
+
+	/* The old code would re-promote on fork, we don't do that
+	 * when using slices as it could cause problem promoting slices
+	 * that have been forced down to 4K
+	 */
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+	mm->context.id = index;
+
+	return 0;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+	spin_lock(&mmu_context_lock);
+	idr_remove(&mmu_context_idr, mm->context.id);
+	spin_unlock(&mmu_context_lock);
+
+	mm->context.id = NO_CONTEXT;
+}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
new file mode 100644
index 0000000..fab3cfa
--- /dev/null
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -0,0 +1,94 @@
+/*
+ * Declarations of procedures and variables shared between files
+ * in arch/ppc/mm/.
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/mm.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu.h>
+
+extern void hash_preload(struct mm_struct *mm, unsigned long ea,
+			 unsigned long access, unsigned long trap);
+
+
+#ifdef CONFIG_PPC32
+extern void mapin_ram(void);
+extern int map_page(unsigned long va, phys_addr_t pa, int flags);
+extern void setbat(int index, unsigned long virt, phys_addr_t phys,
+		   unsigned int size, int flags);
+extern void settlbcam(int index, unsigned long virt, phys_addr_t phys,
+		      unsigned int size, int flags, unsigned int pid);
+extern void invalidate_tlbcam_entry(int index);
+
+extern int __map_without_bats;
+extern unsigned long ioremap_base;
+extern unsigned int rtas_data, rtas_size;
+
+struct hash_pte;
+extern struct hash_pte *Hash, *Hash_end;
+extern unsigned long Hash_size, Hash_mask;
+
+extern unsigned int num_tlbcam_entries;
+#endif
+
+extern unsigned long ioremap_bot;
+extern unsigned long __max_low_memory;
+extern phys_addr_t __initial_memory_limit_addr;
+extern phys_addr_t total_memory;
+extern phys_addr_t total_lowmem;
+extern phys_addr_t memstart_addr;
+extern phys_addr_t lowmem_end_addr;
+
+/* ...and now those things that may be slightly different between processor
+ * architectures.  -- Dan
+ */
+#if defined(CONFIG_8xx)
+#define flush_HPTE(X, va, pg)	_tlbie(va, 0 /* 8xx doesn't care about PID */)
+#define MMU_init_hw()		do { } while(0)
+#define mmu_mapin_ram()		(0UL)
+
+#elif defined(CONFIG_4xx)
+#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
+extern void MMU_init_hw(void);
+extern unsigned long mmu_mapin_ram(void);
+
+#elif defined(CONFIG_FSL_BOOKE)
+#define flush_HPTE(pid, va, pg)	_tlbie(va, pid)
+extern void MMU_init_hw(void);
+extern unsigned long mmu_mapin_ram(void);
+extern void adjust_total_lowmem(void);
+
+#elif defined(CONFIG_PPC32)
+/* anything 32-bit except 4xx or 8xx */
+extern void MMU_init_hw(void);
+extern unsigned long mmu_mapin_ram(void);
+
+/* Be careful....this needs to be updated if we ever encounter 603 SMPs,
+ * which includes all new 82xx processors.  We need tlbie/tlbsync here
+ * in that case (I think). -- Dan.
+ */
+static inline void flush_HPTE(unsigned context, unsigned long va,
+			      unsigned long pdval)
+{
+	if ((Hash != 0) &&
+	    cpu_has_feature(CPU_FTR_HPTE_TABLE))
+		flush_hash_pages(0, va, pdval, 1);
+	else
+		_tlbie(va);
+}
+#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
new file mode 100644
index 0000000..cf81049
--- /dev/null
+++ b/arch/powerpc/mm/numa.c
@@ -0,0 +1,1157 @@
+/*
+ * pSeries NUMA support
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/threads.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/lmb.h>
+#include <linux/of.h>
+#include <asm/sparsemem.h>
+#include <asm/prom.h>
+#include <asm/system.h>
+#include <asm/smp.h>
+
+static int numa_enabled = 1;
+
+static char *cmdline __initdata;
+
+static int numa_debug;
+#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
+
+int numa_cpu_lookup_table[NR_CPUS];
+cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
+struct pglist_data *node_data[MAX_NUMNODES];
+
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(node_data);
+
+static int min_common_depth;
+static int n_mem_addr_cells, n_mem_size_cells;
+
+static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
+						unsigned int *nid)
+{
+	unsigned long long mem;
+	char *p = cmdline;
+	static unsigned int fake_nid;
+	static unsigned long long curr_boundary;
+
+	/*
+	 * Modify node id, iff we started creating NUMA nodes
+	 * We want to continue from where we left of the last time
+	 */
+	if (fake_nid)
+		*nid = fake_nid;
+	/*
+	 * In case there are no more arguments to parse, the
+	 * node_id should be the same as the last fake node id
+	 * (we've handled this above).
+	 */
+	if (!p)
+		return 0;
+
+	mem = memparse(p, &p);
+	if (!mem)
+		return 0;
+
+	if (mem < curr_boundary)
+		return 0;
+
+	curr_boundary = mem;
+
+	if ((end_pfn << PAGE_SHIFT) > mem) {
+		/*
+		 * Skip commas and spaces
+		 */
+		while (*p == ',' || *p == ' ' || *p == '\t')
+			p++;
+
+		cmdline = p;
+		fake_nid++;
+		*nid = fake_nid;
+		dbg("created new fake_node with id %d\n", fake_nid);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * get_active_region_work_fn - A helper function for get_node_active_region
+ *	Returns datax set to the start_pfn and end_pfn if they contain
+ *	the initial value of datax->start_pfn between them
+ * @start_pfn: start page(inclusive) of region to check
+ * @end_pfn: end page(exclusive) of region to check
+ * @datax: comes in with ->start_pfn set to value to search for and
+ *	goes out with active range if it contains it
+ * Returns 1 if search value is in range else 0
+ */
+static int __init get_active_region_work_fn(unsigned long start_pfn,
+					unsigned long end_pfn, void *datax)
+{
+	struct node_active_region *data;
+	data = (struct node_active_region *)datax;
+
+	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
+		data->start_pfn = start_pfn;
+		data->end_pfn = end_pfn;
+		return 1;
+	}
+	return 0;
+
+}
+
+/*
+ * get_node_active_region - Return active region containing start_pfn
+ * Active range returned is empty if none found.
+ * @start_pfn: The page to return the region for.
+ * @node_ar: Returned set to the active region containing start_pfn
+ */
+static void __init get_node_active_region(unsigned long start_pfn,
+		       struct node_active_region *node_ar)
+{
+	int nid = early_pfn_to_nid(start_pfn);
+
+	node_ar->nid = nid;
+	node_ar->start_pfn = start_pfn;
+	node_ar->end_pfn = start_pfn;
+	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
+}
+
+static void __cpuinit map_cpu_to_node(int cpu, int node)
+{
+	numa_cpu_lookup_table[cpu] = node;
+
+	dbg("adding cpu %d to node %d\n", cpu, node);
+
+	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
+		cpu_set(cpu, numa_cpumask_lookup_table[node]);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void unmap_cpu_from_node(unsigned long cpu)
+{
+	int node = numa_cpu_lookup_table[cpu];
+
+	dbg("removing cpu %lu from node %d\n", cpu, node);
+
+	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
+	} else {
+		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
+		       cpu, node);
+	}
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
+{
+	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
+	struct device_node *cpu_node = NULL;
+	const unsigned int *interrupt_server, *reg;
+	int len;
+
+	while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
+		/* Try interrupt server first */
+		interrupt_server = of_get_property(cpu_node,
+					"ibm,ppc-interrupt-server#s", &len);
+
+		len = len / sizeof(u32);
+
+		if (interrupt_server && (len > 0)) {
+			while (len--) {
+				if (interrupt_server[len] == hw_cpuid)
+					return cpu_node;
+			}
+		} else {
+			reg = of_get_property(cpu_node, "reg", &len);
+			if (reg && (len > 0) && (reg[0] == hw_cpuid))
+				return cpu_node;
+		}
+	}
+
+	return NULL;
+}
+
+/* must hold reference to node during call */
+static const int *of_get_associativity(struct device_node *dev)
+{
+	return of_get_property(dev, "ibm,associativity", NULL);
+}
+
+/*
+ * Returns the property linux,drconf-usable-memory if
+ * it exists (the property exists only in kexec/kdump kernels,
+ * added by kexec-tools)
+ */
+static const u32 *of_get_usable_memory(struct device_node *memory)
+{
+	const u32 *prop;
+	u32 len;
+	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
+	if (!prop || len < sizeof(unsigned int))
+		return 0;
+	return prop;
+}
+
+/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
+ * info is found.
+ */
+static int of_node_to_nid_single(struct device_node *device)
+{
+	int nid = -1;
+	const unsigned int *tmp;
+
+	if (min_common_depth == -1)
+		goto out;
+
+	tmp = of_get_associativity(device);
+	if (!tmp)
+		goto out;
+
+	if (tmp[0] >= min_common_depth)
+		nid = tmp[min_common_depth];
+
+	/* POWER4 LPAR uses 0xffff as invalid node */
+	if (nid == 0xffff || nid >= MAX_NUMNODES)
+		nid = -1;
+out:
+	return nid;
+}
+
+/* Walk the device tree upwards, looking for an associativity id */
+int of_node_to_nid(struct device_node *device)
+{
+	struct device_node *tmp;
+	int nid = -1;
+
+	of_node_get(device);
+	while (device) {
+		nid = of_node_to_nid_single(device);
+		if (nid != -1)
+			break;
+
+	        tmp = device;
+		device = of_get_parent(tmp);
+		of_node_put(tmp);
+	}
+	of_node_put(device);
+
+	return nid;
+}
+EXPORT_SYMBOL_GPL(of_node_to_nid);
+
+/*
+ * In theory, the "ibm,associativity" property may contain multiple
+ * associativity lists because a resource may be multiply connected
+ * into the machine.  This resource then has different associativity
+ * characteristics relative to its multiple connections.  We ignore
+ * this for now.  We also assume that all cpu and memory sets have
+ * their distances represented at a common level.  This won't be
+ * true for hierarchical NUMA.
+ *
+ * In any case the ibm,associativity-reference-points should give
+ * the correct depth for a normal NUMA system.
+ *
+ * - Dave Hansen <haveblue@us.ibm.com>
+ */
+static int __init find_min_common_depth(void)
+{
+	int depth;
+	const unsigned int *ref_points;
+	struct device_node *rtas_root;
+	unsigned int len;
+
+	rtas_root = of_find_node_by_path("/rtas");
+
+	if (!rtas_root)
+		return -1;
+
+	/*
+	 * this property is 2 32-bit integers, each representing a level of
+	 * depth in the associativity nodes.  The first is for an SMP
+	 * configuration (should be all 0's) and the second is for a normal
+	 * NUMA configuration.
+	 */
+	ref_points = of_get_property(rtas_root,
+			"ibm,associativity-reference-points", &len);
+
+	if ((len >= 1) && ref_points) {
+		depth = ref_points[1];
+	} else {
+		dbg("NUMA: ibm,associativity-reference-points not found.\n");
+		depth = -1;
+	}
+	of_node_put(rtas_root);
+
+	return depth;
+}
+
+static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
+{
+	struct device_node *memory = NULL;
+
+	memory = of_find_node_by_type(memory, "memory");
+	if (!memory)
+		panic("numa.c: No memory nodes found!");
+
+	*n_addr_cells = of_n_addr_cells(memory);
+	*n_size_cells = of_n_size_cells(memory);
+	of_node_put(memory);
+}
+
+static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
+{
+	unsigned long result = 0;
+
+	while (n--) {
+		result = (result << 32) | **buf;
+		(*buf)++;
+	}
+	return result;
+}
+
+struct of_drconf_cell {
+	u64	base_addr;
+	u32	drc_index;
+	u32	reserved;
+	u32	aa_index;
+	u32	flags;
+};
+
+#define DRCONF_MEM_ASSIGNED	0x00000008
+#define DRCONF_MEM_AI_INVALID	0x00000040
+#define DRCONF_MEM_RESERVED	0x00000080
+
+/*
+ * Read the next lmb list entry from the ibm,dynamic-memory property
+ * and return the information in the provided of_drconf_cell structure.
+ */
+static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
+{
+	const u32 *cp;
+
+	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
+
+	cp = *cellp;
+	drmem->drc_index = cp[0];
+	drmem->reserved = cp[1];
+	drmem->aa_index = cp[2];
+	drmem->flags = cp[3];
+
+	*cellp = cp + 4;
+}
+
+/*
+ * Retreive and validate the ibm,dynamic-memory property of the device tree.
+ *
+ * The layout of the ibm,dynamic-memory property is a number N of lmb
+ * list entries followed by N lmb list entries.  Each lmb list entry
+ * contains information as layed out in the of_drconf_cell struct above.
+ */
+static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
+{
+	const u32 *prop;
+	u32 len, entries;
+
+	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
+	if (!prop || len < sizeof(unsigned int))
+		return 0;
+
+	entries = *prop++;
+
+	/* Now that we know the number of entries, revalidate the size
+	 * of the property read in to ensure we have everything
+	 */
+	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
+		return 0;
+
+	*dm = prop;
+	return entries;
+}
+
+/*
+ * Retreive and validate the ibm,lmb-size property for drconf memory
+ * from the device tree.
+ */
+static u64 of_get_lmb_size(struct device_node *memory)
+{
+	const u32 *prop;
+	u32 len;
+
+	prop = of_get_property(memory, "ibm,lmb-size", &len);
+	if (!prop || len < sizeof(unsigned int))
+		return 0;
+
+	return read_n_cells(n_mem_size_cells, &prop);
+}
+
+struct assoc_arrays {
+	u32	n_arrays;
+	u32	array_sz;
+	const u32 *arrays;
+};
+
+/*
+ * Retreive and validate the list of associativity arrays for drconf
+ * memory from the ibm,associativity-lookup-arrays property of the
+ * device tree..
+ *
+ * The layout of the ibm,associativity-lookup-arrays property is a number N
+ * indicating the number of associativity arrays, followed by a number M
+ * indicating the size of each associativity array, followed by a list
+ * of N associativity arrays.
+ */
+static int of_get_assoc_arrays(struct device_node *memory,
+			       struct assoc_arrays *aa)
+{
+	const u32 *prop;
+	u32 len;
+
+	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
+	if (!prop || len < 2 * sizeof(unsigned int))
+		return -1;
+
+	aa->n_arrays = *prop++;
+	aa->array_sz = *prop++;
+
+	/* Now that we know the number of arrrays and size of each array,
+	 * revalidate the size of the property read in.
+	 */
+	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
+		return -1;
+
+	aa->arrays = prop;
+	return 0;
+}
+
+/*
+ * This is like of_node_to_nid_single() for memory represented in the
+ * ibm,dynamic-reconfiguration-memory node.
+ */
+static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
+				   struct assoc_arrays *aa)
+{
+	int default_nid = 0;
+	int nid = default_nid;
+	int index;
+
+	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
+	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
+	    drmem->aa_index < aa->n_arrays) {
+		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
+		nid = aa->arrays[index];
+
+		if (nid == 0xffff || nid >= MAX_NUMNODES)
+			nid = default_nid;
+	}
+
+	return nid;
+}
+
+/*
+ * Figure out to which domain a cpu belongs and stick it there.
+ * Return the id of the domain used.
+ */
+static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+{
+	int nid = 0;
+	struct device_node *cpu = find_cpu_node(lcpu);
+
+	if (!cpu) {
+		WARN_ON(1);
+		goto out;
+	}
+
+	nid = of_node_to_nid_single(cpu);
+
+	if (nid < 0 || !node_online(nid))
+		nid = any_online_node(NODE_MASK_ALL);
+out:
+	map_cpu_to_node(lcpu, nid);
+
+	of_node_put(cpu);
+
+	return nid;
+}
+
+static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
+			     unsigned long action,
+			     void *hcpu)
+{
+	unsigned long lcpu = (unsigned long)hcpu;
+	int ret = NOTIFY_DONE;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		numa_setup_cpu(lcpu);
+		ret = NOTIFY_OK;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		unmap_cpu_from_node(lcpu);
+		break;
+		ret = NOTIFY_OK;
+#endif
+	}
+	return ret;
+}
+
+/*
+ * Check and possibly modify a memory region to enforce the memory limit.
+ *
+ * Returns the size the region should have to enforce the memory limit.
+ * This will either be the original value of size, a truncated value,
+ * or zero. If the returned value of size is 0 the region should be
+ * discarded as it lies wholy above the memory limit.
+ */
+static unsigned long __init numa_enforce_memory_limit(unsigned long start,
+						      unsigned long size)
+{
+	/*
+	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
+	 * we've already adjusted it for the limit and it takes care of
+	 * having memory holes below the limit.  Also, in the case of
+	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
+	 */
+
+	if (start + size <= lmb_end_of_DRAM())
+		return size;
+
+	if (start >= lmb_end_of_DRAM())
+		return 0;
+
+	return lmb_end_of_DRAM() - start;
+}
+
+/*
+ * Reads the counter for a given entry in
+ * linux,drconf-usable-memory property
+ */
+static inline int __init read_usm_ranges(const u32 **usm)
+{
+	/*
+	 * For each lmb in ibm,dynamic-memory a corresponding
+	 * entry in linux,drconf-usable-memory property contains
+	 * a counter followed by that many (base, size) duple.
+	 * read the counter from linux,drconf-usable-memory
+	 */
+	return read_n_cells(n_mem_size_cells, usm);
+}
+
+/*
+ * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
+ * node.  This assumes n_mem_{addr,size}_cells have been set.
+ */
+static void __init parse_drconf_memory(struct device_node *memory)
+{
+	const u32 *dm, *usm;
+	unsigned int n, rc, ranges, is_kexec_kdump = 0;
+	unsigned long lmb_size, base, size, sz;
+	int nid;
+	struct assoc_arrays aa;
+
+	n = of_get_drconf_memory(memory, &dm);
+	if (!n)
+		return;
+
+	lmb_size = of_get_lmb_size(memory);
+	if (!lmb_size)
+		return;
+
+	rc = of_get_assoc_arrays(memory, &aa);
+	if (rc)
+		return;
+
+	/* check if this is a kexec/kdump kernel */
+	usm = of_get_usable_memory(memory);
+	if (usm != NULL)
+		is_kexec_kdump = 1;
+
+	for (; n != 0; --n) {
+		struct of_drconf_cell drmem;
+
+		read_drconf_cell(&drmem, &dm);
+
+		/* skip this block if the reserved bit is set in flags (0x80)
+		   or if the block is not assigned to this partition (0x8) */
+		if ((drmem.flags & DRCONF_MEM_RESERVED)
+		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
+			continue;
+
+		base = drmem.base_addr;
+		size = lmb_size;
+		ranges = 1;
+
+		if (is_kexec_kdump) {
+			ranges = read_usm_ranges(&usm);
+			if (!ranges) /* there are no (base, size) duple */
+				continue;
+		}
+		do {
+			if (is_kexec_kdump) {
+				base = read_n_cells(n_mem_addr_cells, &usm);
+				size = read_n_cells(n_mem_size_cells, &usm);
+			}
+			nid = of_drconf_to_nid_single(&drmem, &aa);
+			fake_numa_create_new_node(
+				((base + size) >> PAGE_SHIFT),
+					   &nid);
+			node_set_online(nid);
+			sz = numa_enforce_memory_limit(base, size);
+			if (sz)
+				add_active_range(nid, base >> PAGE_SHIFT,
+						 (base >> PAGE_SHIFT)
+						 + (sz >> PAGE_SHIFT));
+		} while (--ranges);
+	}
+}
+
+static int __init parse_numa_properties(void)
+{
+	struct device_node *cpu = NULL;
+	struct device_node *memory = NULL;
+	int default_nid = 0;
+	unsigned long i;
+
+	if (numa_enabled == 0) {
+		printk(KERN_WARNING "NUMA disabled by user\n");
+		return -1;
+	}
+
+	min_common_depth = find_min_common_depth();
+
+	if (min_common_depth < 0)
+		return min_common_depth;
+
+	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+
+	/*
+	 * Even though we connect cpus to numa domains later in SMP
+	 * init, we need to know the node ids now. This is because
+	 * each node to be onlined must have NODE_DATA etc backing it.
+	 */
+	for_each_present_cpu(i) {
+		int nid;
+
+		cpu = find_cpu_node(i);
+		BUG_ON(!cpu);
+		nid = of_node_to_nid_single(cpu);
+		of_node_put(cpu);
+
+		/*
+		 * Don't fall back to default_nid yet -- we will plug
+		 * cpus into nodes once the memory scan has discovered
+		 * the topology.
+		 */
+		if (nid < 0)
+			continue;
+		node_set_online(nid);
+	}
+
+	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
+	memory = NULL;
+	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+		unsigned long start;
+		unsigned long size;
+		int nid;
+		int ranges;
+		const unsigned int *memcell_buf;
+		unsigned int len;
+
+		memcell_buf = of_get_property(memory,
+			"linux,usable-memory", &len);
+		if (!memcell_buf || len <= 0)
+			memcell_buf = of_get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		/* ranges in cell */
+		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+new_range:
+		/* these are order-sensitive, and modify the buffer pointer */
+		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
+		size = read_n_cells(n_mem_size_cells, &memcell_buf);
+
+		/*
+		 * Assumption: either all memory nodes or none will
+		 * have associativity properties.  If none, then
+		 * everything goes to default_nid.
+		 */
+		nid = of_node_to_nid_single(memory);
+		if (nid < 0)
+			nid = default_nid;
+
+		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
+		node_set_online(nid);
+
+		if (!(size = numa_enforce_memory_limit(start, size))) {
+			if (--ranges)
+				goto new_range;
+			else
+				continue;
+		}
+
+		add_active_range(nid, start >> PAGE_SHIFT,
+				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
+
+		if (--ranges)
+			goto new_range;
+	}
+
+	/*
+	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
+	 * property in the ibm,dynamic-reconfiguration-memory node.
+	 */
+	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (memory)
+		parse_drconf_memory(memory);
+
+	return 0;
+}
+
+static void __init setup_nonnuma(void)
+{
+	unsigned long top_of_ram = lmb_end_of_DRAM();
+	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned long start_pfn, end_pfn;
+	unsigned int i, nid = 0;
+
+	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+	       top_of_ram, total_ram);
+	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
+	       (top_of_ram - total_ram) >> 20);
+
+	for (i = 0; i < lmb.memory.cnt; ++i) {
+		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+
+		fake_numa_create_new_node(end_pfn, &nid);
+		add_active_range(nid, start_pfn, end_pfn);
+		node_set_online(nid);
+	}
+}
+
+void __init dump_numa_cpu_topology(void)
+{
+	unsigned int node;
+	unsigned int cpu, count;
+
+	if (min_common_depth == -1 || !numa_enabled)
+		return;
+
+	for_each_online_node(node) {
+		printk(KERN_DEBUG "Node %d CPUs:", node);
+
+		count = 0;
+		/*
+		 * If we used a CPU iterator here we would miss printing
+		 * the holes in the cpumap.
+		 */
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
+				if (count == 0)
+					printk(" %u", cpu);
+				++count;
+			} else {
+				if (count > 1)
+					printk("-%u", cpu - 1);
+				count = 0;
+			}
+		}
+
+		if (count > 1)
+			printk("-%u", NR_CPUS - 1);
+		printk("\n");
+	}
+}
+
+static void __init dump_numa_memory_topology(void)
+{
+	unsigned int node;
+	unsigned int count;
+
+	if (min_common_depth == -1 || !numa_enabled)
+		return;
+
+	for_each_online_node(node) {
+		unsigned long i;
+
+		printk(KERN_DEBUG "Node %d Memory:", node);
+
+		count = 0;
+
+		for (i = 0; i < lmb_end_of_DRAM();
+		     i += (1 << SECTION_SIZE_BITS)) {
+			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
+				if (count == 0)
+					printk(" 0x%lx", i);
+				++count;
+			} else {
+				if (count > 0)
+					printk("-0x%lx", i);
+				count = 0;
+			}
+		}
+
+		if (count > 0)
+			printk("-0x%lx", i);
+		printk("\n");
+	}
+}
+
+/*
+ * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * required. nid is the preferred node and end is the physical address of
+ * the highest address in the node.
+ *
+ * Returns the physical address of the memory.
+ */
+static void __init *careful_allocation(int nid, unsigned long size,
+				       unsigned long align,
+				       unsigned long end_pfn)
+{
+	int new_nid;
+	unsigned long ret = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
+
+	/* retry over all memory */
+	if (!ret)
+		ret = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
+
+	if (!ret)
+		panic("numa.c: cannot allocate %lu bytes on node %d",
+		      size, nid);
+
+	/*
+	 * If the memory came from a previously allocated node, we must
+	 * retry with the bootmem allocator.
+	 */
+	new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT);
+	if (new_nid < nid) {
+		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid),
+				size, align, 0);
+
+		if (!ret)
+			panic("numa.c: cannot allocate %lu bytes on node %d",
+			      size, new_nid);
+
+		ret = __pa(ret);
+
+		dbg("alloc_bootmem %lx %lx\n", ret, size);
+	}
+
+	return (void *)ret;
+}
+
+static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+	.notifier_call = cpu_numa_callback,
+	.priority = 1 /* Must run before sched domains notifier. */
+};
+
+static void mark_reserved_regions_for_nid(int nid)
+{
+	struct pglist_data *node = NODE_DATA(nid);
+	int i;
+
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long physbase = lmb.reserved.region[i].base;
+		unsigned long size = lmb.reserved.region[i].size;
+		unsigned long start_pfn = physbase >> PAGE_SHIFT;
+		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+		struct node_active_region node_ar;
+		unsigned long node_end_pfn = node->node_start_pfn +
+					     node->node_spanned_pages;
+
+		/*
+		 * Check to make sure that this lmb.reserved area is
+		 * within the bounds of the node that we care about.
+		 * Checking the nid of the start and end points is not
+		 * sufficient because the reserved area could span the
+		 * entire node.
+		 */
+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
+
+		get_node_active_region(start_pfn, &node_ar);
+		while (start_pfn < end_pfn &&
+			node_ar.start_pfn < node_ar.end_pfn) {
+			unsigned long reserve_size = size;
+			/*
+			 * if reserved region extends past active region
+			 * then trim size to active region
+			 */
+			if (end_pfn > node_ar.end_pfn)
+				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
+					- (start_pfn << PAGE_SHIFT);
+			/*
+			 * Only worry about *this* node, others may not
+			 * yet have valid NODE_DATA().
+			 */
+			if (node_ar.nid == nid) {
+				dbg("reserve_bootmem %lx %lx nid=%d\n",
+					physbase, reserve_size, node_ar.nid);
+				reserve_bootmem_node(NODE_DATA(node_ar.nid),
+						physbase, reserve_size,
+						BOOTMEM_DEFAULT);
+			}
+			/*
+			 * if reserved region is contained in the active region
+			 * then done.
+			 */
+			if (end_pfn <= node_ar.end_pfn)
+				break;
+
+			/*
+			 * reserved region extends past the active region
+			 *   get next active region that contains this
+			 *   reserved region
+			 */
+			start_pfn = node_ar.end_pfn;
+			physbase = start_pfn << PAGE_SHIFT;
+			size = size - reserve_size;
+			get_node_active_region(start_pfn, &node_ar);
+		}
+	}
+}
+
+
+void __init do_init_bootmem(void)
+{
+	int nid;
+
+	min_low_pfn = 0;
+	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_pfn = max_low_pfn;
+
+	if (parse_numa_properties())
+		setup_nonnuma();
+	else
+		dump_numa_memory_topology();
+
+	register_cpu_notifier(&ppc64_numa_nb);
+	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
+			  (void *)(unsigned long)boot_cpuid);
+
+	for_each_online_node(nid) {
+		unsigned long start_pfn, end_pfn;
+		unsigned long bootmem_paddr;
+		unsigned long bootmap_pages;
+
+		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+		/*
+		 * Allocate the node structure node local if possible
+		 *
+		 * Be careful moving this around, as it relies on all
+		 * previous nodes' bootmem to be initialized and have
+		 * all reserved areas marked.
+		 */
+		NODE_DATA(nid) = careful_allocation(nid,
+					sizeof(struct pglist_data),
+					SMP_CACHE_BYTES, end_pfn);
+		NODE_DATA(nid) = __va(NODE_DATA(nid));
+		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+
+  		dbg("node %d\n", nid);
+		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
+
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+		NODE_DATA(nid)->node_start_pfn = start_pfn;
+		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
+
+		if (NODE_DATA(nid)->node_spanned_pages == 0)
+  			continue;
+
+  		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
+  		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
+
+		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+		bootmem_paddr = (unsigned long)careful_allocation(nid,
+					bootmap_pages << PAGE_SHIFT,
+					PAGE_SIZE, end_pfn);
+		memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT);
+
+		dbg("bootmap_paddr = %lx\n", bootmem_paddr);
+
+		init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+				  start_pfn, end_pfn);
+
+		free_bootmem_with_active_regions(nid, end_pfn);
+		/*
+		 * Be very careful about moving this around.  Future
+		 * calls to careful_allocation() depend on this getting
+		 * done correctly.
+		 */
+		mark_reserved_regions_for_nid(nid);
+		sparse_memory_present_with_active_regions(nid);
+	}
+}
+
+void __init paging_init(void)
+{
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	free_area_init_nodes(max_zone_pfns);
+}
+
+static int __init early_numa(char *p)
+{
+	if (!p)
+		return 0;
+
+	if (strstr(p, "off"))
+		numa_enabled = 0;
+
+	if (strstr(p, "debug"))
+		numa_debug = 1;
+
+	p = strstr(p, "fake=");
+	if (p)
+		cmdline = p + strlen("fake=");
+
+	return 0;
+}
+early_param("numa", early_numa);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Validate the node associated with the memory section we are
+ * trying to add.
+ */
+int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size,
+		      unsigned long scn_addr)
+{
+	nodemask_t nodes;
+
+	if (*nid < 0 || !node_online(*nid))
+		*nid = any_online_node(NODE_MASK_ALL);
+
+	if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) {
+		nodes_setall(nodes);
+		while (NODE_DATA(*nid)->node_spanned_pages == 0) {
+			node_clear(*nid, nodes);
+			*nid = any_online_node(nodes);
+		}
+
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Find the node associated with a hot added memory section represented
+ * by the ibm,dynamic-reconfiguration-memory node.
+ */
+static int hot_add_drconf_scn_to_nid(struct device_node *memory,
+				     unsigned long scn_addr)
+{
+	const u32 *dm;
+	unsigned int n, rc;
+	unsigned long lmb_size;
+	int default_nid = any_online_node(NODE_MASK_ALL);
+	int nid;
+	struct assoc_arrays aa;
+
+	n = of_get_drconf_memory(memory, &dm);
+	if (!n)
+		return default_nid;;
+
+	lmb_size = of_get_lmb_size(memory);
+	if (!lmb_size)
+		return default_nid;
+
+	rc = of_get_assoc_arrays(memory, &aa);
+	if (rc)
+		return default_nid;
+
+	for (; n != 0; --n) {
+		struct of_drconf_cell drmem;
+
+		read_drconf_cell(&drmem, &dm);
+
+		/* skip this block if it is reserved or not assigned to
+		 * this partition */
+		if ((drmem.flags & DRCONF_MEM_RESERVED)
+		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
+			continue;
+
+		nid = of_drconf_to_nid_single(&drmem, &aa);
+
+		if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size,
+				      scn_addr))
+			return nid;
+	}
+
+	BUG();	/* section address should be found above */
+	return 0;
+}
+
+/*
+ * Find the node associated with a hot added memory section.  Section
+ * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
+ * sections are fully contained within a single LMB.
+ */
+int hot_add_scn_to_nid(unsigned long scn_addr)
+{
+	struct device_node *memory = NULL;
+	int nid;
+
+	if (!numa_enabled || (min_common_depth < 0))
+		return any_online_node(NODE_MASK_ALL);
+
+	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (memory) {
+		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
+		of_node_put(memory);
+		return nid;
+	}
+
+	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+		unsigned long start, size;
+		int ranges;
+		const unsigned int *memcell_buf;
+		unsigned int len;
+
+		memcell_buf = of_get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		/* ranges in cell */
+		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+ha_new_range:
+		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
+		size = read_n_cells(n_mem_size_cells, &memcell_buf);
+		nid = of_node_to_nid_single(memory);
+
+		if (valid_hot_add_scn(&nid, start, size, scn_addr)) {
+			of_node_put(memory);
+			return nid;
+		}
+
+		if (--ranges)		/* process all ranges in cell */
+			goto ha_new_range;
+	}
+	BUG();	/* section address should be found above */
+	return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
new file mode 100644
index 0000000..2e664f3
--- /dev/null
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -0,0 +1,422 @@
+/*
+ * This file contains the routines setting up the linux page tables.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/io.h>
+
+#include "mmu_decl.h"
+
+unsigned long ioremap_base;
+unsigned long ioremap_bot;
+EXPORT_SYMBOL(ioremap_bot);	/* aka VMALLOC_END */
+
+#if defined(CONFIG_6xx) || defined(CONFIG_POWER3)
+#define HAVE_BATS	1
+#endif
+
+#if defined(CONFIG_FSL_BOOKE)
+#define HAVE_TLBCAM	1
+#endif
+
+extern char etext[], _stext[];
+
+#ifdef CONFIG_SMP
+extern void hash_page_sync(void);
+#endif
+
+#ifdef HAVE_BATS
+extern phys_addr_t v_mapped_by_bats(unsigned long va);
+extern unsigned long p_mapped_by_bats(phys_addr_t pa);
+void setbat(int index, unsigned long virt, phys_addr_t phys,
+	    unsigned int size, int flags);
+
+#else /* !HAVE_BATS */
+#define v_mapped_by_bats(x)	(0UL)
+#define p_mapped_by_bats(x)	(0UL)
+#endif /* HAVE_BATS */
+
+#ifdef HAVE_TLBCAM
+extern unsigned int tlbcam_index;
+extern phys_addr_t v_mapped_by_tlbcam(unsigned long va);
+extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa);
+#else /* !HAVE_TLBCAM */
+#define v_mapped_by_tlbcam(x)	(0UL)
+#define p_mapped_by_tlbcam(x)	(0UL)
+#endif /* HAVE_TLBCAM */
+
+#ifdef CONFIG_PTE_64BIT
+/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */
+#define PGDIR_ORDER	1
+#else
+#define PGDIR_ORDER	0
+#endif
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *ret;
+
+	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
+	return ret;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	free_pages((unsigned long)pgd, PGDIR_ORDER);
+}
+
+__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+	pte_t *pte;
+	extern int mem_init_done;
+	extern void *early_get_page(void);
+
+	if (mem_init_done) {
+		pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+	} else {
+		pte = (pte_t *)early_get_page();
+		if (pte)
+			clear_page(pte);
+	}
+	return pte;
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *ptepage;
+
+#ifdef CONFIG_HIGHPTE
+	gfp_t flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT | __GFP_ZERO;
+#else
+	gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
+#endif
+
+	ptepage = alloc_pages(flags, 0);
+	if (!ptepage)
+		return NULL;
+	pgtable_page_ctor(ptepage);
+	return ptepage;
+}
+
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+#ifdef CONFIG_SMP
+	hash_page_sync();
+#endif
+	free_page((unsigned long)pte);
+}
+
+void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+#ifdef CONFIG_SMP
+	hash_page_sync();
+#endif
+	pgtable_page_dtor(ptepage);
+	__free_page(ptepage);
+}
+
+void __iomem *
+ioremap(phys_addr_t addr, unsigned long size)
+{
+	return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
+}
+EXPORT_SYMBOL(ioremap);
+
+void __iomem *
+ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
+{
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC);
+
+	return __ioremap(addr, size, flags);
+}
+EXPORT_SYMBOL(ioremap_flags);
+
+void __iomem *
+__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
+{
+	unsigned long v, i;
+	phys_addr_t p;
+	int err;
+
+	/* Make sure we have the base flags */
+	if ((flags & _PAGE_PRESENT) == 0)
+		flags |= _PAGE_KERNEL;
+
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
+	/*
+	 * Choose an address to map it to.
+	 * Once the vmalloc system is running, we use it.
+	 * Before then, we use space going down from ioremap_base
+	 * (ioremap_bot records where we're up to).
+	 */
+	p = addr & PAGE_MASK;
+	size = PAGE_ALIGN(addr + size) - p;
+
+	/*
+	 * If the address lies within the first 16 MB, assume it's in ISA
+	 * memory space
+	 */
+	if (p < 16*1024*1024)
+		p += _ISA_MEM_BASE;
+
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using.
+	 * mem_init() sets high_memory so only do the check after that.
+	 */
+	if (mem_init_done && (p < virt_to_phys(high_memory))) {
+		printk("__ioremap(): phys addr 0x%llx is RAM lr %p\n",
+		       (unsigned long long)p, __builtin_return_address(0));
+		return NULL;
+	}
+
+	if (size == 0)
+		return NULL;
+
+	/*
+	 * Is it already mapped?  Perhaps overlapped by a previous
+	 * BAT mapping.  If the whole area is mapped then we're done,
+	 * otherwise remap it since we want to keep the virt addrs for
+	 * each request contiguous.
+	 *
+	 * We make the assumption here that if the bottom and top
+	 * of the range we want are mapped then it's mapped to the
+	 * same virt address (and this is contiguous).
+	 *  -- Cort
+	 */
+	if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ )
+		goto out;
+
+	if ((v = p_mapped_by_tlbcam(p)))
+		goto out;
+
+	if (mem_init_done) {
+		struct vm_struct *area;
+		area = get_vm_area(size, VM_IOREMAP);
+		if (area == 0)
+			return NULL;
+		v = (unsigned long) area->addr;
+	} else {
+		v = (ioremap_bot -= size);
+	}
+
+	/*
+	 * Should check if it is a candidate for a BAT mapping
+	 */
+
+	err = 0;
+	for (i = 0; i < size && err == 0; i += PAGE_SIZE)
+		err = map_page(v+i, p+i, flags);
+	if (err) {
+		if (mem_init_done)
+			vunmap((void *)v);
+		return NULL;
+	}
+
+out:
+	return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
+}
+EXPORT_SYMBOL(__ioremap);
+
+void iounmap(volatile void __iomem *addr)
+{
+	/*
+	 * If mapped by BATs then there is nothing to do.
+	 * Calling vfree() generates a benign warning.
+	 */
+	if (v_mapped_by_bats((unsigned long)addr)) return;
+
+	if (addr > high_memory && (unsigned long) addr < ioremap_bot)
+		vunmap((void *) (PAGE_MASK & (unsigned long)addr));
+}
+EXPORT_SYMBOL(iounmap);
+
+int map_page(unsigned long va, phys_addr_t pa, int flags)
+{
+	pmd_t *pd;
+	pte_t *pg;
+	int err = -ENOMEM;
+
+	/* Use upper 10 bits of VA to index the first level map */
+	pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
+	/* Use middle 10 bits of VA to index the second-level map */
+	pg = pte_alloc_kernel(pd, va);
+	if (pg != 0) {
+		err = 0;
+		/* The PTE should never be already set nor present in the
+		 * hash table
+		 */
+		BUG_ON(pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE));
+		set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
+						     __pgprot(flags)));
+	}
+	return err;
+}
+
+/*
+ * Map in a big chunk of physical memory starting at KERNELBASE.
+ */
+void __init mapin_ram(void)
+{
+	unsigned long v, s, f;
+	phys_addr_t p;
+	int ktext;
+
+	s = mmu_mapin_ram();
+	v = KERNELBASE + s;
+	p = memstart_addr + s;
+	for (; s < total_lowmem; s += PAGE_SIZE) {
+		ktext = ((char *) v >= _stext && (char *) v < etext);
+		f = ktext ?_PAGE_RAM_TEXT : _PAGE_RAM;
+		map_page(v, p, f);
+#ifdef CONFIG_PPC_STD_MMU_32
+		if (ktext)
+			hash_preload(&init_mm, v, 0, 0x300);
+#endif
+		v += PAGE_SIZE;
+		p += PAGE_SIZE;
+	}
+}
+
+/* Scan the real Linux page tables and return a PTE pointer for
+ * a virtual address in a context.
+ * Returns true (1) if PTE was found, zero otherwise.  The pointer to
+ * the PTE pointer is unmodified if PTE is not found.
+ */
+int
+get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
+{
+        pgd_t	*pgd;
+	pud_t	*pud;
+        pmd_t	*pmd;
+        pte_t	*pte;
+        int     retval = 0;
+
+        pgd = pgd_offset(mm, addr & PAGE_MASK);
+        if (pgd) {
+		pud = pud_offset(pgd, addr & PAGE_MASK);
+		if (pud && pud_present(*pud)) {
+			pmd = pmd_offset(pud, addr & PAGE_MASK);
+			if (pmd_present(*pmd)) {
+				pte = pte_offset_map(pmd, addr & PAGE_MASK);
+				if (pte) {
+					retval = 1;
+					*ptep = pte;
+					if (pmdp)
+						*pmdp = pmd;
+					/* XXX caller needs to do pte_unmap, yuck */
+				}
+			}
+		}
+        }
+        return(retval);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __change_page_attr(struct page *page, pgprot_t prot)
+{
+	pte_t *kpte;
+	pmd_t *kpmd;
+	unsigned long address;
+
+	BUG_ON(PageHighMem(page));
+	address = (unsigned long)page_address(page);
+
+	if (v_mapped_by_bats(address) || v_mapped_by_tlbcam(address))
+		return 0;
+	if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
+		return -EINVAL;
+	set_pte_at(&init_mm, address, kpte, mk_pte(page, prot));
+	wmb();
+	flush_HPTE(0, address, pmd_val(*kpmd));
+	pte_unmap(kpte);
+
+	return 0;
+}
+
+/*
+ * Change the page attributes of an page in the linear mapping.
+ *
+ * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY
+ */
+static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+	int i, err = 0;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for (i = 0; i < numpages; i++, page++) {
+		err = __change_page_attr(page, prot);
+		if (err)
+			break;
+	}
+	local_irq_restore(flags);
+	return err;
+}
+
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (PageHighMem(page))
+		return;
+
+	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+static int fixmaps;
+unsigned long FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(FIXADDR_TOP);
+
+void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+
+	map_page(address, phys, pgprot_val(flags));
+	fixmaps++;
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+	WARN_ON(1);
+}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
new file mode 100644
index 0000000..365e61a
--- /dev/null
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -0,0 +1,250 @@
+/*
+ *  This file contains ioremap and related functions for 64-bit machines.
+ *
+ *  Derived from arch/ppc64/mm/init.c
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/tlb.h>
+#include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/abs_addr.h>
+#include <asm/firmware.h>
+
+#include "mmu_decl.h"
+
+unsigned long ioremap_bot = IOREMAP_BASE;
+
+/*
+ * map_io_page currently only called by __ioremap
+ * map_io_page adds an entry to the ioremap page table
+ * and adds an entry to the HPT, possibly bolting it
+ */
+static int map_io_page(unsigned long ea, unsigned long pa, int flags)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	if (mem_init_done) {
+		pgdp = pgd_offset_k(ea);
+		pudp = pud_alloc(&init_mm, pgdp, ea);
+		if (!pudp)
+			return -ENOMEM;
+		pmdp = pmd_alloc(&init_mm, pudp, ea);
+		if (!pmdp)
+			return -ENOMEM;
+		ptep = pte_alloc_kernel(pmdp, ea);
+		if (!ptep)
+			return -ENOMEM;
+		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
+							  __pgprot(flags)));
+	} else {
+		/*
+		 * If the mm subsystem is not fully up, we cannot create a
+		 * linux page table entry for this mapping.  Simply bolt an
+		 * entry in the hardware page table.
+		 *
+		 */
+		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+				      mmu_io_psize, mmu_kernel_ssize)) {
+			printk(KERN_ERR "Failed to do bolted mapping IO "
+			       "memory at %016lx !\n", pa);
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+
+/**
+ * __ioremap_at - Low level function to establish the page tables
+ *                for an IO mapping
+ */
+void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
+			    unsigned long flags)
+{
+	unsigned long i;
+
+	/* Make sure we have the base flags */
+	if ((flags & _PAGE_PRESENT) == 0)
+		flags |= pgprot_val(PAGE_KERNEL);
+
+	/* Non-cacheable page cannot be coherent */
+	if (flags & _PAGE_NO_CACHE)
+		flags &= ~_PAGE_COHERENT;
+
+	/* We don't support the 4K PFN hack with ioremap */
+	if (flags & _PAGE_4K_PFN)
+		return NULL;
+
+	WARN_ON(pa & ~PAGE_MASK);
+	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
+	WARN_ON(size & ~PAGE_MASK);
+
+	for (i = 0; i < size; i += PAGE_SIZE)
+		if (map_io_page((unsigned long)ea+i, pa+i, flags))
+			return NULL;
+
+	return (void __iomem *)ea;
+}
+
+/**
+ * __iounmap_from - Low level function to tear down the page tables
+ *                  for an IO mapping. This is used for mappings that
+ *                  are manipulated manually, like partial unmapping of
+ *                  PCI IOs or ISA space.
+ */
+void __iounmap_at(void *ea, unsigned long size)
+{
+	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
+	WARN_ON(size & ~PAGE_MASK);
+
+	unmap_kernel_range((unsigned long)ea, size);
+}
+
+void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
+			 unsigned long flags)
+{
+	phys_addr_t paligned;
+	void __iomem *ret;
+
+	/*
+	 * Choose an address to map it to.
+	 * Once the imalloc system is running, we use it.
+	 * Before that, we map using addresses going
+	 * up from ioremap_bot.  imalloc will use
+	 * the addresses from ioremap_bot through
+	 * IMALLOC_END
+	 * 
+	 */
+	paligned = addr & PAGE_MASK;
+	size = PAGE_ALIGN(addr + size) - paligned;
+
+	if ((size == 0) || (paligned == 0))
+		return NULL;
+
+	if (mem_init_done) {
+		struct vm_struct *area;
+
+		area = __get_vm_area(size, VM_IOREMAP,
+				     ioremap_bot, IOREMAP_END);
+		if (area == NULL)
+			return NULL;
+		ret = __ioremap_at(paligned, area->addr, size, flags);
+		if (!ret)
+			vunmap(area->addr);
+	} else {
+		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
+		if (ret)
+			ioremap_bot += size;
+	}
+
+	if (ret)
+		ret += addr & ~PAGE_MASK;
+	return ret;
+}
+
+
+void __iomem * ioremap(phys_addr_t addr, unsigned long size)
+{
+	unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+
+	if (ppc_md.ioremap)
+		return ppc_md.ioremap(addr, size, flags);
+	return __ioremap(addr, size, flags);
+}
+
+void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
+			     unsigned long flags)
+{
+	/* writeable implies dirty for kernel addresses */
+	if (flags & _PAGE_RW)
+		flags |= _PAGE_DIRTY;
+
+	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+	flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
+	if (ppc_md.ioremap)
+		return ppc_md.ioremap(addr, size, flags);
+	return __ioremap(addr, size, flags);
+}
+
+
+/*  
+ * Unmap an IO region and remove it from imalloc'd list.
+ * Access to IO memory should be serialized by driver.
+ */
+void __iounmap(volatile void __iomem *token)
+{
+	void *addr;
+
+	if (!mem_init_done)
+		return;
+	
+	addr = (void *) ((unsigned long __force)
+			 PCI_FIX_ADDR(token) & PAGE_MASK);
+	if ((unsigned long)addr < ioremap_bot) {
+		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
+		       " at 0x%p\n", addr);
+		return;
+	}
+	vunmap(addr);
+}
+
+void iounmap(volatile void __iomem *token)
+{
+	if (ppc_md.iounmap)
+		ppc_md.iounmap(token);
+	else
+		__iounmap(token);
+}
+
+EXPORT_SYMBOL(ioremap);
+EXPORT_SYMBOL(ioremap_flags);
+EXPORT_SYMBOL(__ioremap);
+EXPORT_SYMBOL(__ioremap_at);
+EXPORT_SYMBOL(iounmap);
+EXPORT_SYMBOL(__iounmap);
+EXPORT_SYMBOL(__iounmap_at);
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
new file mode 100644
index 0000000..6aa1208
--- /dev/null
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -0,0 +1,280 @@
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification.  This includes the 6xx, 7xx, 7xxx,
+ * 8260, and POWER3 implementations but excludes the 8xx and 4xx.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/lmb.h>
+
+#include <asm/prom.h>
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+
+#include "mmu_decl.h"
+
+struct hash_pte *Hash, *Hash_end;
+unsigned long Hash_size, Hash_mask;
+unsigned long _SDR1;
+
+struct ppc_bat BATS[8][2];	/* 8 pairs of IBAT, DBAT */
+
+struct batrange {		/* stores address ranges mapped by BATs */
+	unsigned long start;
+	unsigned long limit;
+	phys_addr_t phys;
+} bat_addrs[8];
+
+/*
+ * Return PA for this VA if it is mapped by a BAT, or 0
+ */
+phys_addr_t v_mapped_by_bats(unsigned long va)
+{
+	int b;
+	for (b = 0; b < 4; ++b)
+		if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
+			return bat_addrs[b].phys + (va - bat_addrs[b].start);
+	return 0;
+}
+
+/*
+ * Return VA for a given PA or 0 if not mapped
+ */
+unsigned long p_mapped_by_bats(phys_addr_t pa)
+{
+	int b;
+	for (b = 0; b < 4; ++b)
+		if (pa >= bat_addrs[b].phys
+	    	    && pa < (bat_addrs[b].limit-bat_addrs[b].start)
+		              +bat_addrs[b].phys)
+			return bat_addrs[b].start+(pa-bat_addrs[b].phys);
+	return 0;
+}
+
+unsigned long __init mmu_mapin_ram(void)
+{
+#ifdef CONFIG_POWER4
+	return 0;
+#else
+	unsigned long tot, bl, done;
+	unsigned long max_size = (256<<20);
+
+	if (__map_without_bats) {
+		printk(KERN_DEBUG "RAM mapped without BATs\n");
+		return 0;
+	}
+
+	/* Set up BAT2 and if necessary BAT3 to cover RAM. */
+
+	/* Make sure we don't map a block larger than the
+	   smallest alignment of the physical address. */
+	tot = total_lowmem;
+	for (bl = 128<<10; bl < max_size; bl <<= 1) {
+		if (bl * 2 > tot)
+			break;
+	}
+
+	setbat(2, KERNELBASE, 0, bl, _PAGE_RAM);
+	done = (unsigned long)bat_addrs[2].limit - KERNELBASE + 1;
+	if ((done < tot) && !bat_addrs[3].limit) {
+		/* use BAT3 to cover a bit more */
+		tot -= done;
+		for (bl = 128<<10; bl < max_size; bl <<= 1)
+			if (bl * 2 > tot)
+				break;
+		setbat(3, KERNELBASE+done, done, bl, _PAGE_RAM);
+		done = (unsigned long)bat_addrs[3].limit - KERNELBASE + 1;
+	}
+
+	return done;
+#endif
+}
+
+/*
+ * Set up one of the I/D BAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ */
+void __init setbat(int index, unsigned long virt, phys_addr_t phys,
+		   unsigned int size, int flags)
+{
+	unsigned int bl;
+	int wimgxpp;
+	struct ppc_bat *bat = BATS[index];
+
+	if (((flags & _PAGE_NO_CACHE) == 0) &&
+	    cpu_has_feature(CPU_FTR_NEED_COHERENT))
+		flags |= _PAGE_COHERENT;
+
+	bl = (size >> 17) - 1;
+	if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
+		/* 603, 604, etc. */
+		/* Do DBAT first */
+		wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+				   | _PAGE_COHERENT | _PAGE_GUARDED);
+		wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
+		bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+		bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+#ifndef CONFIG_KGDB /* want user access for breakpoints */
+		if (flags & _PAGE_USER)
+#endif
+			bat[1].batu |= 1; 	/* Vp = 1 */
+		if (flags & _PAGE_GUARDED) {
+			/* G bit must be zero in IBATs */
+			bat[0].batu = bat[0].batl = 0;
+		} else {
+			/* make IBAT same as DBAT */
+			bat[0] = bat[1];
+		}
+	} else {
+		/* 601 cpu */
+		if (bl > BL_8M)
+			bl = BL_8M;
+		wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+				   | _PAGE_COHERENT);
+		wimgxpp |= (flags & _PAGE_RW)?
+			((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
+		bat->batu = virt | wimgxpp | 4;	/* Ks=0, Ku=1 */
+		bat->batl = phys | bl | 0x40;	/* V=1 */
+	}
+
+	bat_addrs[index].start = virt;
+	bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
+	bat_addrs[index].phys = phys;
+}
+
+/*
+ * Preload a translation in the hash table
+ */
+void hash_preload(struct mm_struct *mm, unsigned long ea,
+		  unsigned long access, unsigned long trap)
+{
+	pmd_t *pmd;
+
+	if (Hash == 0)
+		return;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
+	if (!pmd_none(*pmd))
+		add_hash_page(mm->context.id, ea, pmd_val(*pmd));
+}
+
+/*
+ * Initialize the hash table and patch the instructions in hashtable.S.
+ */
+void __init MMU_init_hw(void)
+{
+	unsigned int hmask, mb, mb2;
+	unsigned int n_hpteg, lg_n_hpteg;
+
+	extern unsigned int hash_page_patch_A[];
+	extern unsigned int hash_page_patch_B[], hash_page_patch_C[];
+	extern unsigned int hash_page[];
+	extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
+
+	if (!cpu_has_feature(CPU_FTR_HPTE_TABLE)) {
+		/*
+		 * Put a blr (procedure return) instruction at the
+		 * start of hash_page, since we can still get DSI
+		 * exceptions on a 603.
+		 */
+		hash_page[0] = 0x4e800020;
+		flush_icache_range((unsigned long) &hash_page[0],
+				   (unsigned long) &hash_page[1]);
+		return;
+	}
+
+	if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
+
+#define LG_HPTEG_SIZE	6		/* 64 bytes per HPTEG */
+#define SDR1_LOW_BITS	((n_hpteg - 1) >> 10)
+#define MIN_N_HPTEG	1024		/* min 64kB hash table */
+
+	/*
+	 * Allow 1 HPTE (1/8 HPTEG) for each page of memory.
+	 * This is less than the recommended amount, but then
+	 * Linux ain't AIX.
+	 */
+	n_hpteg = total_memory / (PAGE_SIZE * 8);
+	if (n_hpteg < MIN_N_HPTEG)
+		n_hpteg = MIN_N_HPTEG;
+	lg_n_hpteg = __ilog2(n_hpteg);
+	if (n_hpteg & (n_hpteg - 1)) {
+		++lg_n_hpteg;		/* round up if not power of 2 */
+		n_hpteg = 1 << lg_n_hpteg;
+	}
+	Hash_size = n_hpteg << LG_HPTEG_SIZE;
+
+	/*
+	 * Find some memory for the hash table.
+	 */
+	if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
+	Hash = __va(lmb_alloc_base(Hash_size, Hash_size,
+				   __initial_memory_limit_addr));
+	cacheable_memzero(Hash, Hash_size);
+	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
+
+	Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
+
+	printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
+	       (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
+
+
+	/*
+	 * Patch up the instructions in hashtable.S:create_hpte
+	 */
+	if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345);
+	Hash_mask = n_hpteg - 1;
+	hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
+	mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
+	if (lg_n_hpteg > 16)
+		mb2 = 16 - LG_HPTEG_SIZE;
+
+	hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff)
+		| ((unsigned int)(Hash) >> 16);
+	hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6);
+	hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6);
+	hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask;
+	hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask;
+
+	/*
+	 * Ensure that the locations we've patched have been written
+	 * out from the data cache and invalidated in the instruction
+	 * cache, on those machines with split caches.
+	 */
+	flush_icache_range((unsigned long) &hash_page_patch_A[0],
+			   (unsigned long) &hash_page_patch_C[1]);
+
+	/*
+	 * Patch up the instructions in hashtable.S:flush_hash_page
+	 */
+	flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff)
+		| ((unsigned int)(Hash) >> 16);
+	flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6);
+	flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6);
+	flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask;
+	flush_icache_range((unsigned long) &flush_hash_patch_A[0],
+			   (unsigned long) &flush_hash_patch_B[1]);
+
+	if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
+}
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
new file mode 100644
index 0000000..89497fb
--- /dev/null
+++ b/arch/powerpc/mm/slb.c
@@ -0,0 +1,328 @@
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code writteh by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
+#include <asm/firmware.h>
+#include <linux/compiler.h>
+#include <asm/udbg.h>
+
+#ifdef DEBUG
+#define DBG(fmt...) printk(fmt)
+#else
+#define DBG pr_debug
+#endif
+
+extern void slb_allocate_realmode(unsigned long ea);
+extern void slb_allocate_user(unsigned long ea);
+
+static void slb_allocate(unsigned long ea)
+{
+	/* Currently, we do real mode for all SLBs including user, but
+	 * that will change if we bring back dynamic VSIDs
+	 */
+	slb_allocate_realmode(ea);
+}
+
+#define slb_esid_mask(ssize)	\
+	(((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+					 unsigned long slot)
+{
+	return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
+}
+
+#define slb_vsid_shift(ssize)	\
+	((ssize) == MMU_SEGSIZE_256M? SLB_VSID_SHIFT: SLB_VSID_SHIFT_1T)
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+					 unsigned long flags)
+{
+	return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
+		((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static inline void slb_shadow_update(unsigned long ea, int ssize,
+				     unsigned long flags,
+				     unsigned long entry)
+{
+	/*
+	 * Clear the ESID first so the entry is not valid while we are
+	 * updating it.  No write barriers are needed here, provided
+	 * we only update the current CPU's SLB shadow buffer.
+	 */
+	get_slb_shadow()->save_area[entry].esid = 0;
+	get_slb_shadow()->save_area[entry].vsid = mk_vsid_data(ea, ssize, flags);
+	get_slb_shadow()->save_area[entry].esid = mk_esid_data(ea, ssize, entry);
+}
+
+static inline void slb_shadow_clear(unsigned long entry)
+{
+	get_slb_shadow()->save_area[entry].esid = 0;
+}
+
+static inline void create_shadowed_slbe(unsigned long ea, int ssize,
+					unsigned long flags,
+					unsigned long entry)
+{
+	/*
+	 * Updating the shadow buffer before writing the SLB ensures
+	 * we don't get a stale entry here if we get preempted by PHYP
+	 * between these two statements.
+	 */
+	slb_shadow_update(ea, ssize, flags, entry);
+
+	asm volatile("slbmte  %0,%1" :
+		     : "r" (mk_vsid_data(ea, ssize, flags)),
+		       "r" (mk_esid_data(ea, ssize, entry))
+		     : "memory" );
+}
+
+void slb_flush_and_rebolt(void)
+{
+	/* If you change this make sure you change SLB_NUM_BOLTED
+	 * appropriately too. */
+	unsigned long linear_llp, vmalloc_llp, lflags, vflags;
+	unsigned long ksp_esid_data, ksp_vsid_data;
+
+	WARN_ON(!irqs_disabled());
+
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+	lflags = SLB_VSID_KERNEL | linear_llp;
+	vflags = SLB_VSID_KERNEL | vmalloc_llp;
+
+	ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2);
+	if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) {
+		ksp_esid_data &= ~SLB_ESID_V;
+		ksp_vsid_data = 0;
+		slb_shadow_clear(2);
+	} else {
+		/* Update stack entry; others don't change */
+		slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
+		ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
+	}
+
+	/*
+	 * We can't take a PMU exception in the following code, so hard
+	 * disable interrupts.
+	 */
+	hard_irq_disable();
+
+	/* We need to do this all in asm, so we're sure we don't touch
+	 * the stack between the slbia and rebolting it. */
+	asm volatile("isync\n"
+		     "slbia\n"
+		     /* Slot 1 - first VMALLOC segment */
+		     "slbmte	%0,%1\n"
+		     /* Slot 2 - kernel stack */
+		     "slbmte	%2,%3\n"
+		     "isync"
+		     :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
+		        "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)),
+		        "r"(ksp_vsid_data),
+		        "r"(ksp_esid_data)
+		     : "memory");
+}
+
+void slb_vmalloc_update(void)
+{
+	unsigned long vflags;
+
+	vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp;
+	slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
+	slb_flush_and_rebolt();
+}
+
+/* Helper function to compare esids.  There are four cases to handle.
+ * 1. The system is not 1T segment size capable.  Use the GET_ESID compare.
+ * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare.
+ * 3. The system is 1T capable, only one of the two addresses is > 1T.  This is not a match.
+ * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare.
+ */
+static inline int esids_match(unsigned long addr1, unsigned long addr2)
+{
+	int esid_1t_count;
+
+	/* System is not 1T segment size capable. */
+	if (!cpu_has_feature(CPU_FTR_1T_SEGMENT))
+		return (GET_ESID(addr1) == GET_ESID(addr2));
+
+	esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
+				((addr2 >> SID_SHIFT_1T) != 0));
+
+	/* both addresses are < 1T */
+	if (esid_1t_count == 0)
+		return (GET_ESID(addr1) == GET_ESID(addr2));
+
+	/* One address < 1T, the other > 1T.  Not a match */
+	if (esid_1t_count == 1)
+		return 0;
+
+	/* Both addresses are > 1T. */
+	return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2));
+}
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+	unsigned long offset = get_paca()->slb_cache_ptr;
+	unsigned long slbie_data = 0;
+	unsigned long pc = KSTK_EIP(tsk);
+	unsigned long stack = KSTK_ESP(tsk);
+	unsigned long unmapped_base;
+
+	if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
+	    offset <= SLB_CACHE_ENTRIES) {
+		int i;
+		asm volatile("isync" : : : "memory");
+		for (i = 0; i < offset; i++) {
+			slbie_data = (unsigned long)get_paca()->slb_cache[i]
+				<< SID_SHIFT; /* EA */
+			slbie_data |= user_segment_size(slbie_data)
+				<< SLBIE_SSIZE_SHIFT;
+			slbie_data |= SLBIE_C; /* C set for user addresses */
+			asm volatile("slbie %0" : : "r" (slbie_data));
+		}
+		asm volatile("isync" : : : "memory");
+	} else {
+		slb_flush_and_rebolt();
+	}
+
+	/* Workaround POWER5 < DD2.1 issue */
+	if (offset == 1 || offset > SLB_CACHE_ENTRIES)
+		asm volatile("slbie %0" : : "r" (slbie_data));
+
+	get_paca()->slb_cache_ptr = 0;
+	get_paca()->context = mm->context;
+
+	/*
+	 * preload some userspace segments into the SLB.
+	 */
+	if (test_tsk_thread_flag(tsk, TIF_32BIT))
+		unmapped_base = TASK_UNMAPPED_BASE_USER32;
+	else
+		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+	if (is_kernel_addr(pc))
+		return;
+	slb_allocate(pc);
+
+	if (esids_match(pc,stack))
+		return;
+
+	if (is_kernel_addr(stack))
+		return;
+	slb_allocate(stack);
+
+	if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base))
+		return;
+
+	if (is_kernel_addr(unmapped_base))
+		return;
+	slb_allocate(unmapped_base);
+}
+
+static inline void patch_slb_encoding(unsigned int *insn_addr,
+				      unsigned int immed)
+{
+	/* Assume the instruction had a "0" immediate value, just
+	 * "or" in the new value
+	 */
+	*insn_addr |= immed;
+	flush_icache_range((unsigned long)insn_addr, 4+
+			   (unsigned long)insn_addr);
+}
+
+void slb_initialize(void)
+{
+	unsigned long linear_llp, vmalloc_llp, io_llp;
+	unsigned long lflags, vflags;
+	static int slb_encoding_inited;
+	extern unsigned int *slb_miss_kernel_load_linear;
+	extern unsigned int *slb_miss_kernel_load_io;
+	extern unsigned int *slb_compare_rr_to_size;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	extern unsigned int *slb_miss_kernel_load_vmemmap;
+	unsigned long vmemmap_llp;
+#endif
+
+	/* Prepare our SLB miss handler based on our page size */
+	linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+	vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+	get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+	if (!slb_encoding_inited) {
+		slb_encoding_inited = 1;
+		patch_slb_encoding(slb_miss_kernel_load_linear,
+				   SLB_VSID_KERNEL | linear_llp);
+		patch_slb_encoding(slb_miss_kernel_load_io,
+				   SLB_VSID_KERNEL | io_llp);
+		patch_slb_encoding(slb_compare_rr_to_size,
+				   mmu_slb_size);
+
+		DBG("SLB: linear  LLP = %04lx\n", linear_llp);
+		DBG("SLB: io      LLP = %04lx\n", io_llp);
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+		patch_slb_encoding(slb_miss_kernel_load_vmemmap,
+				   SLB_VSID_KERNEL | vmemmap_llp);
+		DBG("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+	}
+
+	get_paca()->stab_rr = SLB_NUM_BOLTED;
+
+	/* On iSeries the bolted entries have already been set up by
+	 * the hypervisor from the lparMap data in head.S */
+	if (firmware_has_feature(FW_FEATURE_ISERIES))
+		return;
+
+	lflags = SLB_VSID_KERNEL | linear_llp;
+	vflags = SLB_VSID_KERNEL | vmalloc_llp;
+
+	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
+	asm volatile("isync":::"memory");
+	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
+	asm volatile("isync; slbia; isync":::"memory");
+	create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0);
+
+	create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
+
+	/* For the boot cpu, we're running on the stack in init_thread_union,
+	 * which is in the first segment of the linear mapping, and also
+	 * get_paca()->kstack hasn't been initialized yet.
+	 * For secondary cpus, we need to bolt the kernel stack entry now.
+	 */
+	slb_shadow_clear(2);
+	if (raw_smp_processor_id() != boot_cpuid &&
+	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+		create_shadowed_slbe(get_paca()->kstack,
+				     mmu_kernel_ssize, lflags, 2);
+
+	asm volatile("isync":::"memory");
+}
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
new file mode 100644
index 0000000..bc44dc4
--- /dev/null
+++ b/arch/powerpc/mm/slb_low.S
@@ -0,0 +1,301 @@
+/*
+ * Low-level SLB routines
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ *
+ * Based on earlier C version:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cputable.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/firmware.h>
+
+/* void slb_allocate_realmode(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ * 	r3 = faulting address, r13 = PACA
+ *	r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ */
+_GLOBAL(slb_allocate_realmode)
+	/* r3 = faulting address */
+
+	srdi	r9,r3,60		/* get region */
+	srdi	r10,r3,28		/* get esid */
+	cmpldi	cr7,r9,0xc		/* cmp PAGE_OFFSET for later use */
+
+	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
+	blt	cr7,0f			/* user or kernel? */
+
+	/* kernel address: proto-VSID = ESID */
+	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
+	 * this code will generate the protoVSID 0xfffffffff for the
+	 * top segment.  That's ok, the scramble below will translate
+	 * it to VSID 0, which is reserved as a bad VSID - one which
+	 * will never have any pages in it.  */
+
+	/* Check if hitting the linear mapping or some other kernel space
+	*/
+	bne	cr7,1f
+
+	/* Linear mapping encoding bits, the "li" instruction below will
+	 * be patched by the kernel at boot
+	 */
+_GLOBAL(slb_miss_kernel_load_linear)
+	li	r11,0
+BEGIN_FTR_SECTION
+	b	slb_finish_load
+END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
+	b	slb_finish_load_1T
+
+1:
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	/* Check virtual memmap region. To be patches at kernel boot */
+	cmpldi	cr0,r9,0xf
+	bne	1f
+_GLOBAL(slb_miss_kernel_load_vmemmap)
+	li	r11,0
+	b	6f
+1:
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+	/* vmalloc/ioremap mapping encoding bits, the "li" instructions below
+	 * will be patched by the kernel at boot
+	 */
+BEGIN_FTR_SECTION
+	/* check whether this is in vmalloc or ioremap space */
+	clrldi	r11,r10,48
+	cmpldi	r11,(VMALLOC_SIZE >> 28) - 1
+	bgt	5f
+	lhz	r11,PACAVMALLOCSLLP(r13)
+	b	6f
+5:
+END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
+_GLOBAL(slb_miss_kernel_load_io)
+	li	r11,0
+6:
+BEGIN_FTR_SECTION
+	b	slb_finish_load
+END_FTR_SECTION_IFCLR(CPU_FTR_1T_SEGMENT)
+	b	slb_finish_load_1T
+
+0:	/* user address: proto-VSID = context << 15 | ESID. First check
+	 * if the address is within the boundaries of the user region
+	 */
+	srdi.	r9,r10,USER_ESID_BITS
+	bne-	8f			/* invalid ea bits set */
+
+
+	/* when using slices, we extract the psize off the slice bitmaps
+	 * and then we need to get the sllp encoding off the mmu_psize_defs
+	 * array.
+	 *
+	 * XXX This is a bit inefficient especially for the normal case,
+	 * so we should try to implement a fast path for the standard page
+	 * size using the old sllp value so we avoid the array. We cannot
+	 * really do dynamic patching unfortunately as processes might flip
+	 * between 4k and 64k standard page size
+	 */
+#ifdef CONFIG_PPC_MM_SLICES
+	cmpldi	r10,16
+
+	/* Get the slice index * 4 in r11 and matching slice size mask in r9 */
+	ld	r9,PACALOWSLICESPSIZE(r13)
+	sldi	r11,r10,2
+	blt	5f
+	ld	r9,PACAHIGHSLICEPSIZE(r13)
+	srdi	r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
+	andi.	r11,r11,0x3c
+
+5:	/* Extract the psize and multiply to get an array offset */
+	srd	r9,r9,r11
+	andi.	r9,r9,0xf
+	mulli	r9,r9,MMUPSIZEDEFSIZE
+
+	/* Now get to the array and obtain the sllp
+	 */
+	ld	r11,PACATOC(r13)
+	ld	r11,mmu_psize_defs@got(r11)
+	add	r11,r11,r9
+	ld	r11,MMUPSIZESLLP(r11)
+	ori	r11,r11,SLB_VSID_USER
+#else
+	/* paca context sllp already contains the SLB_VSID_USER bits */
+	lhz	r11,PACACONTEXTSLLP(r13)
+#endif /* CONFIG_PPC_MM_SLICES */
+
+	ld	r9,PACACONTEXTID(r13)
+BEGIN_FTR_SECTION
+	cmpldi	r10,0x1000
+END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+	rldimi	r10,r9,USER_ESID_BITS,0
+BEGIN_FTR_SECTION
+	bge	slb_finish_load_1T
+END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+	b	slb_finish_load
+
+8:	/* invalid EA */
+	li	r10,0			/* BAD_VSID */
+	li	r11,SLB_VSID_USER	/* flags don't much matter */
+	b	slb_finish_load
+
+#ifdef __DISABLED__
+
+/* void slb_allocate_user(unsigned long ea);
+ *
+ * Create an SLB entry for the given EA (user or kernel).
+ * 	r3 = faulting address, r13 = PACA
+ *	r9, r10, r11 are clobbered by this function
+ * No other registers are examined or changed.
+ *
+ * It is called with translation enabled in order to be able to walk the
+ * page tables. This is not currently used.
+ */
+_GLOBAL(slb_allocate_user)
+	/* r3 = faulting address */
+	srdi	r10,r3,28		/* get esid */
+
+	crset	4*cr7+lt		/* set "user" flag for later */
+
+	/* check if we fit in the range covered by the pagetables*/
+	srdi.	r9,r3,PGTABLE_EADDR_SIZE
+	crnot	4*cr0+eq,4*cr0+eq
+	beqlr
+
+	/* now we need to get to the page tables in order to get the page
+	 * size encoding from the PMD. In the future, we'll be able to deal
+	 * with 1T segments too by getting the encoding from the PGD instead
+	 */
+	ld	r9,PACAPGDIR(r13)
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,8,25,28
+	ldx	r9,r9,r11		/* get pgd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+	rlwinm	r11,r10,3,17,28
+	ldx	r9,r9,r11		/* get pmd_t */
+	cmpldi	cr0,r9,0
+	beqlr
+
+	/* build vsid flags */
+	andi.	r11,r9,SLB_VSID_LLP
+	ori	r11,r11,SLB_VSID_USER
+
+	/* get context to calculate proto-VSID */
+	ld	r9,PACACONTEXTID(r13)
+	rldimi	r10,r9,USER_ESID_BITS,0
+
+	/* fall through slb_finish_load */
+
+#endif /* __DISABLED__ */
+
+
+/*
+ * Finish loading of an SLB entry and return
+ *
+ * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
+ */
+slb_finish_load:
+	ASM_VSID_SCRAMBLE(r10,r9,256M)
+	rldimi	r11,r10,SLB_VSID_SHIFT,16	/* combine VSID and flags */
+
+	/* r3 = EA, r11 = VSID data */
+	/*
+	 * Find a slot, round robin. Previously we tried to find a
+	 * free slot first but that took too long. Unfortunately we
+ 	 * dont have any LRU information to help us choose a slot.
+ 	 */
+#ifdef CONFIG_PPC_ISERIES
+BEGIN_FW_FTR_SECTION
+	/*
+	 * On iSeries, the "bolted" stack segment can be cast out on
+	 * shared processor switch so we need to check for a miss on
+	 * it and restore it to the right slot.
+	 */
+	ld	r9,PACAKSAVE(r13)
+	clrrdi	r9,r9,28
+	clrrdi	r3,r3,28
+	li	r10,SLB_NUM_BOLTED-1	/* Stack goes in last bolted slot */
+	cmpld	r9,r3
+	beq	3f
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
+#endif /* CONFIG_PPC_ISERIES */
+
+7:	ld	r10,PACASTABRR(r13)
+	addi	r10,r10,1
+	/* This gets soft patched on boot. */
+_GLOBAL(slb_compare_rr_to_size)
+	cmpldi	r10,0
+
+	blt+	4f
+	li	r10,SLB_NUM_BOLTED
+
+4:
+	std	r10,PACASTABRR(r13)
+
+3:
+	rldimi	r3,r10,0,36		/* r3= EA[0:35] | entry */
+	oris	r10,r3,SLB_ESID_V@h	/* r3 |= SLB_ESID_V */
+
+	/* r3 = ESID data, r11 = VSID data */
+
+	/*
+	 * No need for an isync before or after this slbmte. The exception
+	 * we enter with and the rfid we exit with are context synchronizing.
+	 */
+	slbmte	r11,r10
+
+	/* we're done for kernel addresses */
+	crclr	4*cr0+eq		/* set result to "success" */
+	bgelr	cr7
+
+	/* Update the slb cache */
+	lhz	r3,PACASLBCACHEPTR(r13)	/* offset = paca->slb_cache_ptr */
+	cmpldi	r3,SLB_CACHE_ENTRIES
+	bge	1f
+
+	/* still room in the slb cache */
+	sldi	r11,r3,1		/* r11 = offset * sizeof(u16) */
+	rldicl	r10,r10,36,28		/* get low 16 bits of the ESID */
+	add	r11,r11,r13		/* r11 = (u16 *)paca + offset */
+	sth	r10,PACASLBCACHE(r11)	/* paca->slb_cache[offset] = esid */
+	addi	r3,r3,1			/* offset++ */
+	b	2f
+1:					/* offset >= SLB_CACHE_ENTRIES */
+	li	r3,SLB_CACHE_ENTRIES+1
+2:
+	sth	r3,PACASLBCACHEPTR(r13)	/* paca->slb_cache_ptr = offset */
+	crclr	4*cr0+eq		/* set result to "success" */
+	blr
+
+/*
+ * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
+ * We assume legacy iSeries will never have 1T segments.
+ *
+ * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9
+ */
+slb_finish_load_1T:
+	srdi	r10,r10,40-28		/* get 1T ESID */
+	ASM_VSID_SCRAMBLE(r10,r9,1T)
+	rldimi	r11,r10,SLB_VSID_SHIFT_1T,16	/* combine VSID and flags */
+	li	r10,MMU_SEGSIZE_1T
+	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
+
+	/* r3 = EA, r11 = VSID data */
+	clrrdi	r3,r3,SID_SHIFT_1T	/* clear out non-ESID bits */
+	b	7b
+
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
new file mode 100644
index 0000000..ba51948
--- /dev/null
+++ b/arch/powerpc/mm/slice.c
@@ -0,0 +1,734 @@
+/*
+ * address space "slices" (meta-segments) support
+ *
+ * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * Based on hugetlb implementation
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/mman.h>
+#include <asm/mmu.h>
+#include <asm/spu.h>
+
+static DEFINE_SPINLOCK(slice_convert_lock);
+
+
+#ifdef DEBUG
+int _slice_debug = 1;
+
+static void slice_print_mask(const char *label, struct slice_mask mask)
+{
+	char	*p, buf[16 + 3 + 16 + 1];
+	int	i;
+
+	if (!_slice_debug)
+		return;
+	p = buf;
+	for (i = 0; i < SLICE_NUM_LOW; i++)
+		*(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
+	*(p++) = ' ';
+	*(p++) = '-';
+	*(p++) = ' ';
+	for (i = 0; i < SLICE_NUM_HIGH; i++)
+		*(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
+	*(p++) = 0;
+
+	printk(KERN_DEBUG "%s:%s\n", label, buf);
+}
+
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
+
+#else
+
+static void slice_print_mask(const char *label, struct slice_mask mask) {}
+#define slice_dbg(fmt...)
+
+#endif
+
+static struct slice_mask slice_range_to_mask(unsigned long start,
+					     unsigned long len)
+{
+	unsigned long end = start + len - 1;
+	struct slice_mask ret = { 0, 0 };
+
+	if (start < SLICE_LOW_TOP) {
+		unsigned long mend = min(end, SLICE_LOW_TOP);
+		unsigned long mstart = min(start, SLICE_LOW_TOP);
+
+		ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+			- (1u << GET_LOW_SLICE_INDEX(mstart));
+	}
+
+	if ((start + len) > SLICE_LOW_TOP)
+		ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
+			- (1u << GET_HIGH_SLICE_INDEX(start));
+
+	return ret;
+}
+
+static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
+			      unsigned long len)
+{
+	struct vm_area_struct *vma;
+
+	if ((mm->task_size - len) < addr)
+		return 0;
+	vma = find_vma(mm, addr);
+	return (!vma || (addr + len) <= vma->vm_start);
+}
+
+static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
+{
+	return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT,
+				   1ul << SLICE_LOW_SHIFT);
+}
+
+static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
+{
+	unsigned long start = slice << SLICE_HIGH_SHIFT;
+	unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
+
+	/* Hack, so that each addresses is controlled by exactly one
+	 * of the high or low area bitmaps, the first high area starts
+	 * at 4GB, not 0 */
+	if (start == 0)
+		start = SLICE_LOW_TOP;
+
+	return !slice_area_is_free(mm, start, end - start);
+}
+
+static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
+{
+	struct slice_mask ret = { 0, 0 };
+	unsigned long i;
+
+	for (i = 0; i < SLICE_NUM_LOW; i++)
+		if (!slice_low_has_vma(mm, i))
+			ret.low_slices |= 1u << i;
+
+	if (mm->task_size <= SLICE_LOW_TOP)
+		return ret;
+
+	for (i = 0; i < SLICE_NUM_HIGH; i++)
+		if (!slice_high_has_vma(mm, i))
+			ret.high_slices |= 1u << i;
+
+	return ret;
+}
+
+static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+{
+	struct slice_mask ret = { 0, 0 };
+	unsigned long i;
+	u64 psizes;
+
+	psizes = mm->context.low_slices_psize;
+	for (i = 0; i < SLICE_NUM_LOW; i++)
+		if (((psizes >> (i * 4)) & 0xf) == psize)
+			ret.low_slices |= 1u << i;
+
+	psizes = mm->context.high_slices_psize;
+	for (i = 0; i < SLICE_NUM_HIGH; i++)
+		if (((psizes >> (i * 4)) & 0xf) == psize)
+			ret.high_slices |= 1u << i;
+
+	return ret;
+}
+
+static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
+{
+	return (mask.low_slices & available.low_slices) == mask.low_slices &&
+		(mask.high_slices & available.high_slices) == mask.high_slices;
+}
+
+static void slice_flush_segments(void *parm)
+{
+	struct mm_struct *mm = parm;
+	unsigned long flags;
+
+	if (mm != current->active_mm)
+		return;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = current->active_mm->context;
+
+	local_irq_save(flags);
+	slb_flush_and_rebolt();
+	local_irq_restore(flags);
+}
+
+static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
+{
+	/* Write the new slice psize bits */
+	u64 lpsizes, hpsizes;
+	unsigned long i, flags;
+
+	slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
+	slice_print_mask(" mask", mask);
+
+	/* We need to use a spinlock here to protect against
+	 * concurrent 64k -> 4k demotion ...
+	 */
+	spin_lock_irqsave(&slice_convert_lock, flags);
+
+	lpsizes = mm->context.low_slices_psize;
+	for (i = 0; i < SLICE_NUM_LOW; i++)
+		if (mask.low_slices & (1u << i))
+			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
+				(((unsigned long)psize) << (i * 4));
+
+	hpsizes = mm->context.high_slices_psize;
+	for (i = 0; i < SLICE_NUM_HIGH; i++)
+		if (mask.high_slices & (1u << i))
+			hpsizes = (hpsizes & ~(0xful << (i * 4))) |
+				(((unsigned long)psize) << (i * 4));
+
+	mm->context.low_slices_psize = lpsizes;
+	mm->context.high_slices_psize = hpsizes;
+
+	slice_dbg(" lsps=%lx, hsps=%lx\n",
+		  mm->context.low_slices_psize,
+		  mm->context.high_slices_psize);
+
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+#ifdef CONFIG_SPU_BASE
+	spu_flush_all_slbs(mm);
+#endif
+}
+
+static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
+					      unsigned long len,
+					      struct slice_mask available,
+					      int psize, int use_cache)
+{
+	struct vm_area_struct *vma;
+	unsigned long start_addr, addr;
+	struct slice_mask mask;
+	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+
+	if (use_cache) {
+		if (len <= mm->cached_hole_size) {
+			start_addr = addr = TASK_UNMAPPED_BASE;
+			mm->cached_hole_size = 0;
+		} else
+			start_addr = addr = mm->free_area_cache;
+	} else
+		start_addr = addr = TASK_UNMAPPED_BASE;
+
+full_search:
+	for (;;) {
+		addr = _ALIGN_UP(addr, 1ul << pshift);
+		if ((TASK_SIZE - len) < addr)
+			break;
+		vma = find_vma(mm, addr);
+		BUG_ON(vma && (addr >= vma->vm_end));
+
+		mask = slice_range_to_mask(addr, len);
+		if (!slice_check_fit(mask, available)) {
+			if (addr < SLICE_LOW_TOP)
+				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_LOW_SHIFT);
+			else
+				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_HIGH_SHIFT);
+			continue;
+		}
+		if (!vma || addr + len <= vma->vm_start) {
+			/*
+			 * Remember the place where we stopped the search:
+			 */
+			if (use_cache)
+				mm->free_area_cache = addr + len;
+			return addr;
+		}
+		if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+		addr = vma->vm_end;
+	}
+
+	/* Make sure we didn't miss any holes */
+	if (use_cache && start_addr != TASK_UNMAPPED_BASE) {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+		goto full_search;
+	}
+	return -ENOMEM;
+}
+
+static unsigned long slice_find_area_topdown(struct mm_struct *mm,
+					     unsigned long len,
+					     struct slice_mask available,
+					     int psize, int use_cache)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+	struct slice_mask mask;
+	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+
+	/* check if free_area_cache is useful for us */
+	if (use_cache) {
+		if (len <= mm->cached_hole_size) {
+			mm->cached_hole_size = 0;
+			mm->free_area_cache = mm->mmap_base;
+		}
+
+		/* either no address requested or can't fit in requested
+		 * address hole
+		 */
+		addr = mm->free_area_cache;
+
+		/* make sure it can fit in the remaining address space */
+		if (addr > len) {
+			addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
+			mask = slice_range_to_mask(addr, len);
+			if (slice_check_fit(mask, available) &&
+			    slice_area_is_free(mm, addr, len))
+					/* remember the address as a hint for
+					 * next time
+					 */
+					return (mm->free_area_cache = addr);
+		}
+	}
+
+	addr = mm->mmap_base;
+	while (addr > len) {
+		/* Go down by chunk size */
+		addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
+
+		/* Check for hit with different page size */
+		mask = slice_range_to_mask(addr, len);
+		if (!slice_check_fit(mask, available)) {
+			if (addr < SLICE_LOW_TOP)
+				addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
+			else if (addr < (1ul << SLICE_HIGH_SHIFT))
+				addr = SLICE_LOW_TOP;
+			else
+				addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+			continue;
+		}
+
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (!vma || (addr + len) <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			if (use_cache)
+				mm->free_area_cache = addr;
+			return addr;
+		}
+
+		/* remember the largest hole we saw so far */
+		if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start;
+	}
+
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	addr = slice_find_area_bottomup(mm, len, available, psize, 0);
+
+	/*
+	 * Restore the topdown base:
+	 */
+	if (use_cache) {
+		mm->free_area_cache = mm->mmap_base;
+		mm->cached_hole_size = ~0UL;
+	}
+
+	return addr;
+}
+
+
+static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
+				     struct slice_mask mask, int psize,
+				     int topdown, int use_cache)
+{
+	if (topdown)
+		return slice_find_area_topdown(mm, len, mask, psize, use_cache);
+	else
+		return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
+}
+
+#define or_mask(dst, src)	do {			\
+	(dst).low_slices |= (src).low_slices;		\
+	(dst).high_slices |= (src).high_slices;		\
+} while (0)
+
+#define andnot_mask(dst, src)	do {			\
+	(dst).low_slices &= ~(src).low_slices;		\
+	(dst).high_slices &= ~(src).high_slices;	\
+} while (0)
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_PAGE_BASE	MMU_PAGE_64K
+#else
+#define MMU_PAGE_BASE	MMU_PAGE_4K
+#endif
+
+unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
+				      unsigned long flags, unsigned int psize,
+				      int topdown, int use_cache)
+{
+	struct slice_mask mask = {0, 0};
+	struct slice_mask good_mask;
+	struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
+	struct slice_mask compat_mask = {0, 0};
+	int fixed = (flags & MAP_FIXED);
+	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	struct mm_struct *mm = current->mm;
+	unsigned long newaddr;
+
+	/* Sanity checks */
+	BUG_ON(mm->task_size == 0);
+
+	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
+	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d, use_cache=%d\n",
+		  addr, len, flags, topdown, use_cache);
+
+	if (len > mm->task_size)
+		return -ENOMEM;
+	if (len & ((1ul << pshift) - 1))
+		return -EINVAL;
+	if (fixed && (addr & ((1ul << pshift) - 1)))
+		return -EINVAL;
+	if (fixed && addr > (mm->task_size - len))
+		return -EINVAL;
+
+	/* If hint, make sure it matches our alignment restrictions */
+	if (!fixed && addr) {
+		addr = _ALIGN_UP(addr, 1ul << pshift);
+		slice_dbg(" aligned addr=%lx\n", addr);
+		/* Ignore hint if it's too large or overlaps a VMA */
+		if (addr > mm->task_size - len ||
+		    !slice_area_is_free(mm, addr, len))
+			addr = 0;
+	}
+
+	/* First make up a "good" mask of slices that have the right size
+	 * already
+	 */
+	good_mask = slice_mask_for_size(mm, psize);
+	slice_print_mask(" good_mask", good_mask);
+
+	/*
+	 * Here "good" means slices that are already the right page size,
+	 * "compat" means slices that have a compatible page size (i.e.
+	 * 4k in a 64k pagesize kernel), and "free" means slices without
+	 * any VMAs.
+	 *
+	 * If MAP_FIXED:
+	 *	check if fits in good | compat => OK
+	 *	check if fits in good | compat | free => convert free
+	 *	else bad
+	 * If have hint:
+	 *	check if hint fits in good => OK
+	 *	check if hint fits in good | free => convert free
+	 * Otherwise:
+	 *	search in good, found => OK
+	 *	search in good | free, found => convert free
+	 *	search in good | compat | free, found => convert free.
+	 */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/* If we support combo pages, we can allow 64k pages in 4k slices */
+	if (psize == MMU_PAGE_64K) {
+		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		if (fixed)
+			or_mask(good_mask, compat_mask);
+	}
+#endif
+
+	/* First check hint if it's valid or if we have MAP_FIXED */
+	if (addr != 0 || fixed) {
+		/* Build a mask for the requested range */
+		mask = slice_range_to_mask(addr, len);
+		slice_print_mask(" mask", mask);
+
+		/* Check if we fit in the good mask. If we do, we just return,
+		 * nothing else to do
+		 */
+		if (slice_check_fit(mask, good_mask)) {
+			slice_dbg(" fits good !\n");
+			return addr;
+		}
+	} else {
+		/* Now let's see if we can find something in the existing
+		 * slices for that size
+		 */
+		newaddr = slice_find_area(mm, len, good_mask, psize, topdown,
+					  use_cache);
+		if (newaddr != -ENOMEM) {
+			/* Found within the good mask, we don't have to setup,
+			 * we thus return directly
+			 */
+			slice_dbg(" found area at 0x%lx\n", newaddr);
+			return newaddr;
+		}
+	}
+
+	/* We don't fit in the good mask, check what other slices are
+	 * empty and thus can be converted
+	 */
+	potential_mask = slice_mask_for_free(mm);
+	or_mask(potential_mask, good_mask);
+	slice_print_mask(" potential", potential_mask);
+
+	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+		slice_dbg(" fits potential !\n");
+		goto convert;
+	}
+
+	/* If we have MAP_FIXED and failed the above steps, then error out */
+	if (fixed)
+		return -EBUSY;
+
+	slice_dbg(" search...\n");
+
+	/* If we had a hint that didn't work out, see if we can fit
+	 * anywhere in the good area.
+	 */
+	if (addr) {
+		addr = slice_find_area(mm, len, good_mask, psize, topdown,
+				       use_cache);
+		if (addr != -ENOMEM) {
+			slice_dbg(" found area at 0x%lx\n", addr);
+			return addr;
+		}
+	}
+
+	/* Now let's see if we can find something in the existing slices
+	 * for that size plus free slices
+	 */
+	addr = slice_find_area(mm, len, potential_mask, psize, topdown,
+			       use_cache);
+
+#ifdef CONFIG_PPC_64K_PAGES
+	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
+		/* retry the search with 4k-page slices included */
+		or_mask(potential_mask, compat_mask);
+		addr = slice_find_area(mm, len, potential_mask, psize,
+				       topdown, use_cache);
+	}
+#endif
+
+	if (addr == -ENOMEM)
+		return -ENOMEM;
+
+	mask = slice_range_to_mask(addr, len);
+	slice_dbg(" found potential area at 0x%lx\n", addr);
+	slice_print_mask(" mask", mask);
+
+ convert:
+	andnot_mask(mask, good_mask);
+	andnot_mask(mask, compat_mask);
+	if (mask.low_slices || mask.high_slices) {
+		slice_convert(mm, mask, psize);
+		if (psize > MMU_PAGE_BASE)
+			on_each_cpu(slice_flush_segments, mm, 1);
+	}
+	return addr;
+
+}
+EXPORT_SYMBOL_GPL(slice_get_unmapped_area);
+
+unsigned long arch_get_unmapped_area(struct file *filp,
+				     unsigned long addr,
+				     unsigned long len,
+				     unsigned long pgoff,
+				     unsigned long flags)
+{
+	return slice_get_unmapped_area(addr, len, flags,
+				       current->mm->context.user_psize,
+				       0, 1);
+}
+
+unsigned long arch_get_unmapped_area_topdown(struct file *filp,
+					     const unsigned long addr0,
+					     const unsigned long len,
+					     const unsigned long pgoff,
+					     const unsigned long flags)
+{
+	return slice_get_unmapped_area(addr0, len, flags,
+				       current->mm->context.user_psize,
+				       1, 1);
+}
+
+unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
+{
+	u64 psizes;
+	int index;
+
+	if (addr < SLICE_LOW_TOP) {
+		psizes = mm->context.low_slices_psize;
+		index = GET_LOW_SLICE_INDEX(addr);
+	} else {
+		psizes = mm->context.high_slices_psize;
+		index = GET_HIGH_SLICE_INDEX(addr);
+	}
+
+	return (psizes >> (index * 4)) & 0xf;
+}
+EXPORT_SYMBOL_GPL(get_slice_psize);
+
+/*
+ * This is called by hash_page when it needs to do a lazy conversion of
+ * an address space from real 64K pages to combo 4K pages (typically
+ * when hitting a non cacheable mapping on a processor or hypervisor
+ * that won't allow them for 64K pages).
+ *
+ * This is also called in init_new_context() to change back the user
+ * psize from whatever the parent context had it set to
+ * N.B. This may be called before mm->context.id has been set.
+ *
+ * This function will only change the content of the {low,high)_slice_psize
+ * masks, it will not flush SLBs as this shall be handled lazily by the
+ * caller.
+ */
+void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
+{
+	unsigned long flags, lpsizes, hpsizes;
+	unsigned int old_psize;
+	int i;
+
+	slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize);
+
+	spin_lock_irqsave(&slice_convert_lock, flags);
+
+	old_psize = mm->context.user_psize;
+	slice_dbg(" old_psize=%d\n", old_psize);
+	if (old_psize == psize)
+		goto bail;
+
+	mm->context.user_psize = psize;
+	wmb();
+
+	lpsizes = mm->context.low_slices_psize;
+	for (i = 0; i < SLICE_NUM_LOW; i++)
+		if (((lpsizes >> (i * 4)) & 0xf) == old_psize)
+			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
+				(((unsigned long)psize) << (i * 4));
+
+	hpsizes = mm->context.high_slices_psize;
+	for (i = 0; i < SLICE_NUM_HIGH; i++)
+		if (((hpsizes >> (i * 4)) & 0xf) == old_psize)
+			hpsizes = (hpsizes & ~(0xful << (i * 4))) |
+				(((unsigned long)psize) << (i * 4));
+
+	mm->context.low_slices_psize = lpsizes;
+	mm->context.high_slices_psize = hpsizes;
+
+	slice_dbg(" lsps=%lx, hsps=%lx\n",
+		  mm->context.low_slices_psize,
+		  mm->context.high_slices_psize);
+
+ bail:
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+}
+
+void slice_set_psize(struct mm_struct *mm, unsigned long address,
+		     unsigned int psize)
+{
+	unsigned long i, flags;
+	u64 *p;
+
+	spin_lock_irqsave(&slice_convert_lock, flags);
+	if (address < SLICE_LOW_TOP) {
+		i = GET_LOW_SLICE_INDEX(address);
+		p = &mm->context.low_slices_psize;
+	} else {
+		i = GET_HIGH_SLICE_INDEX(address);
+		p = &mm->context.high_slices_psize;
+	}
+	*p = (*p & ~(0xful << (i * 4))) | ((unsigned long) psize << (i * 4));
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+#ifdef CONFIG_SPU_BASE
+	spu_flush_all_slbs(mm);
+#endif
+}
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+			   unsigned long len, unsigned int psize)
+{
+	struct slice_mask mask = slice_range_to_mask(start, len);
+
+	slice_convert(mm, mask, psize);
+}
+
+/*
+ * is_hugepage_only_range() is used by generic code to verify wether
+ * a normal mmap mapping (non hugetlbfs) is valid on a given area.
+ *
+ * until the generic code provides a more generic hook and/or starts
+ * calling arch get_unmapped_area for MAP_FIXED (which our implementation
+ * here knows how to deal with), we hijack it to keep standard mappings
+ * away from us.
+ *
+ * because of that generic code limitation, MAP_FIXED mapping cannot
+ * "convert" back a slice with no VMAs to the standard page size, only
+ * get_unmapped_area() can. It would be possible to fix it here but I
+ * prefer working on fixing the generic code instead.
+ *
+ * WARNING: This will not work if hugetlbfs isn't enabled since the
+ * generic code will redefine that function as 0 in that. This is ok
+ * for now as we only use slices with hugetlbfs enabled. This should
+ * be fixed as the generic code gets fixed.
+ */
+int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
+			   unsigned long len)
+{
+	struct slice_mask mask, available;
+	unsigned int psize = mm->context.user_psize;
+
+	mask = slice_range_to_mask(addr, len);
+	available = slice_mask_for_size(mm, psize);
+#ifdef CONFIG_PPC_64K_PAGES
+	/* We need to account for 4k slices too */
+	if (psize == MMU_PAGE_64K) {
+		struct slice_mask compat_mask;
+		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		or_mask(available, compat_mask);
+	}
+#endif
+
+#if 0 /* too verbose */
+	slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n",
+		 mm, addr, len);
+	slice_print_mask(" mask", mask);
+	slice_print_mask(" available", available);
+#endif
+	return !slice_check_fit(mask, available);
+}
+
diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c
new file mode 100644
index 0000000..60e6032
--- /dev/null
+++ b/arch/powerpc/mm/stab.c
@@ -0,0 +1,287 @@
+/*
+ * PowerPC64 Segment Translation Support.
+ *
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ *    Copyright (c) 2001 Dave Engebretsen
+ *
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/lmb.h>
+
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/cputable.h>
+#include <asm/prom.h>
+#include <asm/abs_addr.h>
+#include <asm/firmware.h>
+#include <asm/iseries/hv_call.h>
+
+struct stab_entry {
+	unsigned long esid_data;
+	unsigned long vsid_data;
+};
+
+#define NR_STAB_CACHE_ENTRIES 8
+static DEFINE_PER_CPU(long, stab_cache_ptr);
+static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+
+/*
+ * Create a segment table entry for the given esid/vsid pair.
+ */
+static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid)
+{
+	unsigned long esid_data, vsid_data;
+	unsigned long entry, group, old_esid, castout_entry, i;
+	unsigned int global_entry;
+	struct stab_entry *ste, *castout_ste;
+	unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET;
+
+	vsid_data = vsid << STE_VSID_SHIFT;
+	esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V;
+	if (! kernel_segment)
+		esid_data |= STE_ESID_KS;
+
+	/* Search the primary group first. */
+	global_entry = (esid & 0x1f) << 3;
+	ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+
+	/* Find an empty entry, if one exists. */
+	for (group = 0; group < 2; group++) {
+		for (entry = 0; entry < 8; entry++, ste++) {
+			if (!(ste->esid_data & STE_ESID_V)) {
+				ste->vsid_data = vsid_data;
+				eieio();
+				ste->esid_data = esid_data;
+				return (global_entry | entry);
+			}
+		}
+		/* Now search the secondary group. */
+		global_entry = ((~esid) & 0x1f) << 3;
+		ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+	}
+
+	/*
+	 * Could not find empty entry, pick one with a round robin selection.
+	 * Search all entries in the two groups.
+	 */
+	castout_entry = get_paca()->stab_rr;
+	for (i = 0; i < 16; i++) {
+		if (castout_entry < 8) {
+			global_entry = (esid & 0x1f) << 3;
+			ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7));
+			castout_ste = ste + castout_entry;
+		} else {
+			global_entry = ((~esid) & 0x1f) << 3;
+			ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7));
+			castout_ste = ste + (castout_entry - 8);
+		}
+
+		/* Dont cast out the first kernel segment */
+		if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET)
+			break;
+
+		castout_entry = (castout_entry + 1) & 0xf;
+	}
+
+	get_paca()->stab_rr = (castout_entry + 1) & 0xf;
+
+	/* Modify the old entry to the new value. */
+
+	/* Force previous translations to complete. DRENG */
+	asm volatile("isync" : : : "memory");
+
+	old_esid = castout_ste->esid_data >> SID_SHIFT;
+	castout_ste->esid_data = 0;		/* Invalidate old entry */
+
+	asm volatile("sync" : : : "memory");    /* Order update */
+
+	castout_ste->vsid_data = vsid_data;
+	eieio();				/* Order update */
+	castout_ste->esid_data = esid_data;
+
+	asm volatile("slbie  %0" : : "r" (old_esid << SID_SHIFT));
+	/* Ensure completion of slbie */
+	asm volatile("sync" : : : "memory");
+
+	return (global_entry | (castout_entry & 0x7));
+}
+
+/*
+ * Allocate a segment table entry for the given ea and mm
+ */
+static int __ste_allocate(unsigned long ea, struct mm_struct *mm)
+{
+	unsigned long vsid;
+	unsigned char stab_entry;
+	unsigned long offset;
+
+	/* Kernel or user address? */
+	if (is_kernel_addr(ea)) {
+		vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M);
+	} else {
+		if ((ea >= TASK_SIZE_USER64) || (! mm))
+			return 1;
+
+		vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M);
+	}
+
+	stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid);
+
+	if (!is_kernel_addr(ea)) {
+		offset = __get_cpu_var(stab_cache_ptr);
+		if (offset < NR_STAB_CACHE_ENTRIES)
+			__get_cpu_var(stab_cache[offset++]) = stab_entry;
+		else
+			offset = NR_STAB_CACHE_ENTRIES+1;
+		__get_cpu_var(stab_cache_ptr) = offset;
+
+		/* Order update */
+		asm volatile("sync":::"memory");
+	}
+
+	return 0;
+}
+
+int ste_allocate(unsigned long ea)
+{
+	return __ste_allocate(ea, current->mm);
+}
+
+/*
+ * Do the segment table work for a context switch: flush all user
+ * entries from the table, then preload some probably useful entries
+ * for the new task
+ */
+void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
+	struct stab_entry *ste;
+	unsigned long offset = __get_cpu_var(stab_cache_ptr);
+	unsigned long pc = KSTK_EIP(tsk);
+	unsigned long stack = KSTK_ESP(tsk);
+	unsigned long unmapped_base;
+
+	/* Force previous translations to complete. DRENG */
+	asm volatile("isync" : : : "memory");
+
+	if (offset <= NR_STAB_CACHE_ENTRIES) {
+		int i;
+
+		for (i = 0; i < offset; i++) {
+			ste = stab + __get_cpu_var(stab_cache[i]);
+			ste->esid_data = 0; /* invalidate entry */
+		}
+	} else {
+		unsigned long entry;
+
+		/* Invalidate all entries. */
+		ste = stab;
+
+		/* Never flush the first entry. */
+		ste += 1;
+		for (entry = 1;
+		     entry < (HW_PAGE_SIZE / sizeof(struct stab_entry));
+		     entry++, ste++) {
+			unsigned long ea;
+			ea = ste->esid_data & ESID_MASK;
+			if (!is_kernel_addr(ea)) {
+				ste->esid_data = 0;
+			}
+		}
+	}
+
+	asm volatile("sync; slbia; sync":::"memory");
+
+	__get_cpu_var(stab_cache_ptr) = 0;
+
+	/* Now preload some entries for the new task */
+	if (test_tsk_thread_flag(tsk, TIF_32BIT))
+		unmapped_base = TASK_UNMAPPED_BASE_USER32;
+	else
+		unmapped_base = TASK_UNMAPPED_BASE_USER64;
+
+	__ste_allocate(pc, mm);
+
+	if (GET_ESID(pc) == GET_ESID(stack))
+		return;
+
+	__ste_allocate(stack, mm);
+
+	if ((GET_ESID(pc) == GET_ESID(unmapped_base))
+	    || (GET_ESID(stack) == GET_ESID(unmapped_base)))
+		return;
+
+	__ste_allocate(unmapped_base, mm);
+
+	/* Order update */
+	asm volatile("sync" : : : "memory");
+}
+
+/*
+ * Allocate segment tables for secondary CPUs.  These must all go in
+ * the first (bolted) segment, so that do_stab_bolted won't get a
+ * recursive segment miss on the segment table itself.
+ */
+void __init stabs_alloc(void)
+{
+	int cpu;
+
+	if (cpu_has_feature(CPU_FTR_SLB))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		unsigned long newstab;
+
+		if (cpu == 0)
+			continue; /* stab for CPU 0 is statically allocated */
+
+		newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE,
+					 1<<SID_SHIFT);
+		newstab = (unsigned long)__va(newstab);
+
+		memset((void *)newstab, 0, HW_PAGE_SIZE);
+
+		paca[cpu].stab_addr = newstab;
+		paca[cpu].stab_real = virt_to_abs(newstab);
+		printk(KERN_INFO "Segment table for CPU %d at 0x%lx "
+		       "virtual, 0x%lx absolute\n",
+		       cpu, paca[cpu].stab_addr, paca[cpu].stab_real);
+	}
+}
+
+/*
+ * Build an entry for the base kernel segment and put it into
+ * the segment table or SLB.  All other segment table or SLB
+ * entries are faulted in.
+ */
+void stab_initialize(unsigned long stab)
+{
+	unsigned long vsid = get_kernel_vsid(PAGE_OFFSET, MMU_SEGSIZE_256M);
+	unsigned long stabreal;
+
+	asm volatile("isync; slbia; isync":::"memory");
+	make_ste(stab, GET_ESID(PAGE_OFFSET), vsid);
+
+	/* Order update */
+	asm volatile("sync":::"memory");
+
+	/* Set ASR */
+	stabreal = get_paca()->stab_real | 0x1ul;
+
+#ifdef CONFIG_PPC_ISERIES
+	if (firmware_has_feature(FW_FEATURE_ISERIES)) {
+		HvCall1(HvCallBaseSetASR, stabreal);
+		return;
+	}
+#endif /* CONFIG_PPC_ISERIES */
+
+	mtspr(SPRN_ASR, stabreal);
+}
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
new file mode 100644
index 0000000..4cafc0c
--- /dev/null
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2007-2008 Paul Mackerras, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Free all pages allocated for subpage protection maps and pointers.
+ * Also makes sure that the subpage_prot_table structure is
+ * reinitialized for the next user.
+ */
+void subpage_prot_free(pgd_t *pgd)
+{
+	struct subpage_prot_table *spt = pgd_subpage_prot(pgd);
+	unsigned long i, j, addr;
+	u32 **p;
+
+	for (i = 0; i < 4; ++i) {
+		if (spt->low_prot[i]) {
+			free_page((unsigned long)spt->low_prot[i]);
+			spt->low_prot[i] = NULL;
+		}
+	}
+	addr = 0;
+	for (i = 0; i < 2; ++i) {
+		p = spt->protptrs[i];
+		if (!p)
+			continue;
+		spt->protptrs[i] = NULL;
+		for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr;
+		     ++j, addr += PAGE_SIZE)
+			if (p[j])
+				free_page((unsigned long)p[j]);
+		free_page((unsigned long)p);
+	}
+	spt->maxaddr = 0;
+}
+
+static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
+			     int npages)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return;
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return;
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return;
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	arch_enter_lazy_mmu_mode();
+	for (; npages > 0; --npages) {
+		pte_update(mm, addr, pte, 0, 0);
+		addr += PAGE_SIZE;
+		++pte;
+	}
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Clear the subpage protection map for an address range, allowing
+ * all accesses that are allowed by the pte permissions.
+ */
+static void subpage_prot_clear(unsigned long addr, unsigned long len)
+{
+	struct mm_struct *mm = current->mm;
+	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	u32 **spm, *spp;
+	int i, nw;
+	unsigned long next, limit;
+
+	down_write(&mm->mmap_sem);
+	limit = addr + len;
+	if (limit > spt->maxaddr)
+		limit = spt->maxaddr;
+	for (; addr < limit; addr = next) {
+		next = pmd_addr_end(addr, limit);
+		if (addr < 0x100000000) {
+			spm = spt->low_prot;
+		} else {
+			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+			if (!spm)
+				continue;
+		}
+		spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)];
+		if (!spp)
+			continue;
+		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+		nw = PTRS_PER_PTE - i;
+		if (addr + (nw << PAGE_SHIFT) > next)
+			nw = (next - addr) >> PAGE_SHIFT;
+
+		memset(spp, 0, nw * sizeof(u32));
+
+		/* now flush any existing HPTEs for the range */
+		hpte_flush_range(mm, addr, nw);
+	}
+	up_write(&mm->mmap_sem);
+}
+
+/*
+ * Copy in a subpage protection map for an address range.
+ * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
+ * Each 2-bit field is 0 to allow any access, 1 to prevent writes,
+ * 2 or 3 to prevent all accesses.
+ * Note that the normal page protections also apply; the subpage
+ * protection mechanism is an additional constraint, so putting 0
+ * in a 2-bit field won't allow writes to a page that is otherwise
+ * write-protected.
+ */
+long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
+{
+	struct mm_struct *mm = current->mm;
+	struct subpage_prot_table *spt = pgd_subpage_prot(mm->pgd);
+	u32 **spm, *spp;
+	int i, nw;
+	unsigned long next, limit;
+	int err;
+
+	/* Check parameters */
+	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
+	    addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
+		return -EINVAL;
+
+	if (is_hugepage_only_range(mm, addr, len))
+		return -EINVAL;
+
+	if (!map) {
+		/* Clear out the protection map for the address range */
+		subpage_prot_clear(addr, len);
+		return 0;
+	}
+
+	if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32)))
+		return -EFAULT;
+
+	down_write(&mm->mmap_sem);
+	for (limit = addr + len; addr < limit; addr = next) {
+		next = pmd_addr_end(addr, limit);
+		err = -ENOMEM;
+		if (addr < 0x100000000) {
+			spm = spt->low_prot;
+		} else {
+			spm = spt->protptrs[addr >> SBP_L3_SHIFT];
+			if (!spm) {
+				spm = (u32 **)get_zeroed_page(GFP_KERNEL);
+				if (!spm)
+					goto out;
+				spt->protptrs[addr >> SBP_L3_SHIFT] = spm;
+			}
+		}
+		spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1);
+		spp = *spm;
+		if (!spp) {
+			spp = (u32 *)get_zeroed_page(GFP_KERNEL);
+			if (!spp)
+				goto out;
+			*spm = spp;
+		}
+		spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1);
+
+		local_irq_disable();
+		demote_segment_4k(mm, addr);
+		local_irq_enable();
+
+		i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+		nw = PTRS_PER_PTE - i;
+		if (addr + (nw << PAGE_SHIFT) > next)
+			nw = (next - addr) >> PAGE_SHIFT;
+
+		up_write(&mm->mmap_sem);
+		err = -EFAULT;
+		if (__copy_from_user(spp, map, nw * sizeof(u32)))
+			goto out2;
+		map += nw;
+		down_write(&mm->mmap_sem);
+
+		/* now flush any existing HPTEs for the range */
+		hpte_flush_range(mm, addr, nw);
+	}
+	if (limit > spt->maxaddr)
+		spt->maxaddr = limit;
+	err = 0;
+ out:
+	up_write(&mm->mmap_sem);
+ out2:
+	return err;
+}
diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c
new file mode 100644
index 0000000..f9a47fe
--- /dev/null
+++ b/arch/powerpc/mm/tlb_32.c
@@ -0,0 +1,190 @@
+/*
+ * This file contains the routines for TLB flushing.
+ * On machines where the MMU uses a hash table to store virtual to
+ * physical translations, these routines flush entries from the
+ * hash table also.
+ *  -- paulus
+ *
+ *  Derived from arch/ppc/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#include "mmu_decl.h"
+
+/*
+ * Called when unmapping pages to flush entries from the TLB/hash table.
+ */
+void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
+{
+	unsigned long ptephys;
+
+	if (Hash != 0) {
+		ptephys = __pa(ptep) & PAGE_MASK;
+		flush_hash_pages(mm->context.id, addr, ptephys, 1);
+	}
+}
+EXPORT_SYMBOL(flush_hash_entry);
+
+/*
+ * Called by ptep_set_access_flags, must flush on CPUs for which the
+ * DSI handler can't just "fixup" the TLB on a write fault
+ */
+void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr)
+{
+	if (Hash != 0)
+		return;
+	_tlbie(addr);
+}
+
+/*
+ * Called at the end of a mmu_gather operation to make sure the
+ * TLB flush is completely done.
+ */
+void tlb_flush(struct mmu_gather *tlb)
+{
+	if (Hash == 0) {
+		/*
+		 * 603 needs to flush the whole TLB here since
+		 * it doesn't use a hash table.
+		 */
+		_tlbia();
+	}
+}
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes kernel pages
+ *
+ * since the hardware hash table functions as an extension of the
+ * tlb as far as the linux tables are concerned, flush it too.
+ *    -- Cort
+ */
+
+/*
+ * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
+ * the cache operations on the bus.  Hence we need to use an IPI
+ * to get the other CPU(s) to invalidate their TLBs.
+ */
+#ifdef CONFIG_SMP_750
+#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
+#else
+#define FINISH_FLUSH	do { } while (0)
+#endif
+
+static void flush_range(struct mm_struct *mm, unsigned long start,
+			unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long pmd_end;
+	int count;
+	unsigned int ctx = mm->context.id;
+
+	if (Hash == 0) {
+		_tlbia();
+		return;
+	}
+	start &= PAGE_MASK;
+	if (start >= end)
+		return;
+	end = (end - 1) | ~PAGE_MASK;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
+	for (;;) {
+		pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
+		if (pmd_end > end)
+			pmd_end = end;
+		if (!pmd_none(*pmd)) {
+			count = ((pmd_end - start) >> PAGE_SHIFT) + 1;
+			flush_hash_pages(ctx, start, pmd_val(*pmd), count);
+		}
+		if (pmd_end == end)
+			break;
+		start = pmd_end + 1;
+		++pmd;
+	}
+}
+
+/*
+ * Flush kernel TLB entries in the given range
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	flush_range(&init_mm, start, end);
+	FINISH_FLUSH;
+}
+
+/*
+ * Flush all the (user) entries for the address space described by mm.
+ */
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *mp;
+
+	if (Hash == 0) {
+		_tlbia();
+		return;
+	}
+
+	/*
+	 * It is safe to go down the mm's list of vmas when called
+	 * from dup_mmap, holding mmap_sem.  It would also be safe from
+	 * unmap_region or exit_mmap, but not from vmtruncate on SMP -
+	 * but it seems dup_mmap is the only SMP case which gets here.
+	 */
+	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
+		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
+	FINISH_FLUSH;
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	struct mm_struct *mm;
+	pmd_t *pmd;
+
+	if (Hash == 0) {
+		_tlbie(vmaddr);
+		return;
+	}
+	mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
+	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
+	if (!pmd_none(*pmd))
+		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
+	FINISH_FLUSH;
+}
+
+/*
+ * For each address in the range, find the pte for the address
+ * and check _PAGE_HASHPTE bit; if it is set, find and destroy
+ * the corresponding HPTE.
+ */
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	flush_range(vma->vm_mm, start, end);
+	FINISH_FLUSH;
+}
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
new file mode 100644
index 0000000..be7dd42
--- /dev/null
+++ b/arch/powerpc/mm/tlb_64.c
@@ -0,0 +1,297 @@
+/*
+ * This file contains the routines for flushing entries from the
+ * TLB and MMU hash table.
+ *
+ *  Derived from arch/ppc64/mm/init.c:
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ *    Copyright (C) 1996 Paul Mackerras
+ *
+ *  Derived from "arch/i386/mm/init.c"
+ *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Dave Engebretsen <engebret@us.ibm.com>
+ *      Rework for PPC64 port.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/bug.h>
+
+DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+/* This is declared as we are using the more or less generic
+ * arch/powerpc/include/asm/tlb.h file -- tgall
+ */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+static unsigned long pte_freelist_forced_free;
+
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+{
+	/* This is safe since tlb_gather_mmu has disabled preemption */
+        cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
+		pgtable_free(pgf);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+		if (*batchp == NULL) {
+			pgtable_free_now(pgf);
+			return;
+		}
+		(*batchp)->index = 0;
+	}
+	(*batchp)->tables[(*batchp)->index++] = pgf;
+	if ((*batchp)->index == PTE_FREELIST_SIZE) {
+		pte_free_submit(*batchp);
+		*batchp = NULL;
+	}
+}
+
+/*
+ * A linux PTE was changed and the corresponding hash table entry
+ * neesd to be flushed. This function will either perform the flush
+ * immediately or will batch it up if the current CPU has an active
+ * batch on it.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, unsigned long pte, int huge)
+{
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	unsigned long vsid, vaddr;
+	unsigned int psize;
+	int ssize;
+	real_pte_t rpte;
+	int i;
+
+	i = batch->index;
+
+	/* We mask the address for the base page size. Huge pages will
+	 * have applied their own masking already
+	 */
+	addr &= PAGE_MASK;
+
+	/* Get page size (maybe move back to caller).
+	 *
+	 * NOTE: when using special 64K mappings in 4K environment like
+	 * for SPEs, we obtain the page size from the slice, which thus
+	 * must still exist (and thus the VMA not reused) at the time
+	 * of this call
+	 */
+	if (huge) {
+#ifdef CONFIG_HUGETLB_PAGE
+		psize = get_slice_psize(mm, addr);;
+#else
+		BUG();
+		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
+#endif
+	} else
+		psize = pte_pagesize_index(mm, addr, pte);
+
+	/* Build full vaddr */
+	if (!is_kernel_addr(addr)) {
+		ssize = user_segment_size(addr);
+		vsid = get_vsid(mm->context.id, addr, ssize);
+		WARN_ON(vsid == 0);
+	} else {
+		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+		ssize = mmu_kernel_ssize;
+	}
+	vaddr = hpt_va(addr, vsid, ssize);
+	rpte = __real_pte(__pte(pte), ptep);
+
+	/*
+	 * Check if we have an active batch on this CPU. If not, just
+	 * flush now and return. For now, we don global invalidates
+	 * in that case, might be worth testing the mm cpu mask though
+	 * and decide to use local invalidates instead...
+	 */
+	if (!batch->active) {
+		flush_hash_page(vaddr, rpte, psize, ssize, 0);
+		return;
+	}
+
+	/*
+	 * This can happen when we are in the middle of a TLB batch and
+	 * we encounter memory pressure (eg copy_page_range when it tries
+	 * to allocate a new pte). If we have to reclaim memory and end
+	 * up scanning and resetting referenced bits then our batch context
+	 * will change mid stream.
+	 *
+	 * We also need to ensure only one page size is present in a given
+	 * batch
+	 */
+	if (i != 0 && (mm != batch->mm || batch->psize != psize ||
+		       batch->ssize != ssize)) {
+		__flush_tlb_pending(batch);
+		i = 0;
+	}
+	if (i == 0) {
+		batch->mm = mm;
+		batch->psize = psize;
+		batch->ssize = ssize;
+	}
+	batch->pte[i] = rpte;
+	batch->vaddr[i] = vaddr;
+	batch->index = ++i;
+	if (i >= PPC64_TLB_BATCH_NR)
+		__flush_tlb_pending(batch);
+}
+
+/*
+ * This function is called when terminating an mmu batch or when a batch
+ * is full. It will perform the flush of all the entries currently stored
+ * in a batch.
+ *
+ * Must be called from within some kind of spinlock/non-preempt region...
+ */
+void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
+{
+	cpumask_t tmp;
+	int i, local = 0;
+
+	i = batch->index;
+	tmp = cpumask_of_cpu(smp_processor_id());
+	if (cpus_equal(batch->mm->cpu_vm_mask, tmp))
+		local = 1;
+	if (i == 1)
+		flush_hash_page(batch->vaddr[0], batch->pte[0],
+				batch->psize, batch->ssize, local);
+	else
+		flush_hash_range(i, local);
+	batch->index = 0;
+}
+
+void pte_free_finish(void)
+{
+	/* This is safe since tlb_gather_mmu has disabled preemption */
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (*batchp == NULL)
+		return;
+	pte_free_submit(*batchp);
+	*batchp = NULL;
+}
+
+/**
+ * __flush_hash_table_range - Flush all HPTEs for a given address range
+ *                            from the hash table (and the TLB). But keeps
+ *                            the linux PTEs intact.
+ *
+ * @mm		: mm_struct of the target address space (generally init_mm)
+ * @start	: starting address
+ * @end         : ending address (not included in the flush)
+ *
+ * This function is mostly to be used by some IO hotplug code in order
+ * to remove all hash entries from a given address range used to map IO
+ * space on a removed PCI-PCI bidge without tearing down the full mapping
+ * since 64K pages may overlap with other bridges when using 64K pages
+ * with 4K HW pages on IO space.
+ *
+ * Because of that usage pattern, it's only available with CONFIG_HOTPLUG
+ * and is implemented for small size rather than speed.
+ */
+#ifdef CONFIG_HOTPLUG
+
+void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+			      unsigned long end)
+{
+	unsigned long flags;
+
+	start = _ALIGN_DOWN(start, PAGE_SIZE);
+	end = _ALIGN_UP(end, PAGE_SIZE);
+
+	BUG_ON(!mm->pgd);
+
+	/* Note: Normally, we should only ever use a batch within a
+	 * PTE locked section. This violates the rule, but will work
+	 * since we don't actually modify the PTEs, we just flush the
+	 * hash while leaving the PTEs intact (including their reference
+	 * to being hashed). This is not the most performance oriented
+	 * way to do things but is fine for our needs here.
+	 */
+	local_irq_save(flags);
+	arch_enter_lazy_mmu_mode();
+	for (; start < end; start += PAGE_SIZE) {
+		pte_t *ptep = find_linux_pte(mm->pgd, start);
+		unsigned long pte;
+
+		if (ptep == NULL)
+			continue;
+		pte = pte_val(*ptep);
+		if (!(pte & _PAGE_HASHPTE))
+			continue;
+		hpte_need_flush(mm, start, ptep, pte, 0);
+	}
+	arch_leave_lazy_mmu_mode();
+	local_irq_restore(flags);
+}
+
+#endif /* CONFIG_HOTPLUG */
author	Timothy Pearson <tpearson@raptorengineering.com>	2017-08-23 14:45:25 -0500
committer	Timothy Pearson <tpearson@raptorengineering.com>	2017-08-23 14:45:25 -0500
commit	fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch)
tree	22962a4387943edc841c72a4e636a068c66d58fd /arch/powerpc/mm
download	ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz