From 41151e77a4d96ea138cede6d84c955aa4769ce74 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Tue, 28 Jun 2011 09:54:48 +0000
Subject: powerpc: Hugetlb for BookE

Enable hugepages on Freescale BookE processors.  This allows the kernel to
use huge TLB entries to map pages, which can greatly reduce the number of
TLB misses and the amount of TLB thrashing experienced by applications with
large memory footprints.  Care should be taken when using this on FSL
processors, as the number of large TLB entries supported by the core is low
(16-64) on current processors.

The supported set of hugepage sizes includes 4m, 16m, 64m, 256m, and 1g.
Page sizes larger than the max zone size are called "gigantic" pages and
must be allocated on the command line (and cannot be deallocated).

This is currently only fully implemented for Freescale 32-bit BookE
processors, but there is some infrastructure in the code for
64-bit BookE.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/Makefile             |   1 +
 arch/powerpc/mm/hash_utils_64.c      |   3 -
 arch/powerpc/mm/hugetlbpage-book3e.c | 121 +++++++++++
 arch/powerpc/mm/hugetlbpage.c        | 379 +++++++++++++++++++++++++++++++----
 arch/powerpc/mm/init_32.c            |   9 +
 arch/powerpc/mm/mem.c                |   5 +
 arch/powerpc/mm/mmu_context_nohash.c |   5 +
 arch/powerpc/mm/pgtable.c            |   3 +-
 arch/powerpc/mm/tlb_low_64e.S        |  24 +--
 arch/powerpc/mm/tlb_nohash.c         |  46 ++++-
 10 files changed, 536 insertions(+), 60 deletions(-)
 create mode 100644 arch/powerpc/mm/hugetlbpage-book3e.c

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index bdca46e..991ee81 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y				+= hugetlbpage.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 26b2872..1f8b2a0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -105,9 +105,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
 EXPORT_SYMBOL_GPL(mmu_slb_size);
-#ifdef CONFIG_HUGETLB_PAGE
-unsigned int HPAGE_SHIFT;
-#endif
 #ifdef CONFIG_PPC_64K_PAGES
 int mmu_ci_restrictions;
 #endif
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
new file mode 100644
index 0000000..1295b7c
--- /dev/null
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -0,0 +1,121 @@
+/*
+ * PPC Huge TLB Page Support for Book3E MMU
+ *
+ * Copyright (C) 2009 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
+ *
+ */
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+
+static inline int mmu_get_tsize(int psize)
+{
+	return mmu_psize_defs[psize].enc;
+}
+
+static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
+{
+	int found = 0;
+
+	mtspr(SPRN_MAS6, pid << 16);
+	if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) {
+		asm volatile(
+			"li	%0,0\n"
+			"tlbsx.	0,%1\n"
+			"bne	1f\n"
+			"li	%0,1\n"
+			"1:\n"
+			: "=&r"(found) : "r"(ea));
+	} else {
+		asm volatile(
+			"tlbsx	0,%1\n"
+			"mfspr	%0,0x271\n"
+			"srwi	%0,%0,31\n"
+			: "=&r"(found) : "r"(ea));
+	}
+
+	return found;
+}
+
+void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte)
+{
+	unsigned long mas1, mas2;
+	u64 mas7_3;
+	unsigned long psize, tsize, shift;
+	unsigned long flags;
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	int index, lz, ncams;
+	struct vm_area_struct *vma;
+#endif
+
+	if (unlikely(is_kernel_addr(ea)))
+		return;
+
+#ifdef CONFIG_MM_SLICES
+	psize = mmu_get_tsize(get_slice_psize(mm, ea));
+	tsize = mmu_get_psize(psize);
+	shift = mmu_psize_defs[psize].shift;
+#else
+	vma = find_vma(mm, ea);
+	psize = vma_mmu_pagesize(vma);	/* returns actual size in bytes */
+	asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (psize));
+	shift = 31 - lz;
+	tsize = 21 - lz;
+#endif
+
+	/*
+	 * We can't be interrupted while we're setting up the MAS
+	 * registers or after we've confirmed that no tlb exists.
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(book3e_tlb_exists(ea, mm->context.id))) {
+		local_irq_restore(flags);
+		return;
+	}
+
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	/* We have to use the CAM(TLB1) on FSL parts for hugepages */
+	index = __get_cpu_var(next_tlbcam_idx);
+	mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
+
+	/* Just round-robin the entries and wrap when we hit the end */
+	if (unlikely(index == ncams - 1))
+		__get_cpu_var(next_tlbcam_idx) = tlbcam_index;
+	else
+		__get_cpu_var(next_tlbcam_idx)++;
+#endif
+	mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
+	mas2 = ea & ~((1UL << shift) - 1);
+	mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+	mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT;
+	mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK;
+	if (!pte_dirty(pte))
+		mas7_3 &= ~(MAS3_SW|MAS3_UW);
+
+	mtspr(SPRN_MAS1, mas1);
+	mtspr(SPRN_MAS2, mas2);
+
+	if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) {
+		mtspr(SPRN_MAS7_MAS3, mas7_3);
+	} else {
+		mtspr(SPRN_MAS7, upper_32_bits(mas7_3));
+		mtspr(SPRN_MAS3, lower_32_bits(mas7_3));
+	}
+
+	asm volatile ("tlbwe");
+
+	local_irq_restore(flags);
+}
+
+void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
+{
+	struct hstate *hstate = hstate_file(vma->vm_file);
+	unsigned long tsize = huge_page_shift(hstate) - 10;
+
+	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, tsize, 0);
+
+}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1..3a5f59d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,7 +1,8 @@
 /*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
  *
  * Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
  *
  * Based on the IA-32 version:
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
@@ -11,24 +12,39 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
+#include <asm/setup.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define MAX_NUMBER_GPAGES	1024
+unsigned int HPAGE_SHIFT;
 
-/* Tracks the 16G pages after the device tree is scanned and before the
- * huge_boot_pages list is ready.  */
-static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  On 64-bit implementations, this is
+ * just used to track 16G pages and so is a single array.  32-bit
+ * implementations may have more than one gpage size due to limitations
+ * of the memory allocators, so we need multiple arrays
+ */
+#ifdef CONFIG_PPC64
+#define MAX_NUMBER_GPAGES	1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
-
-/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
+#else
+#define MAX_NUMBER_GPAGES	128
+struct psize_gpages {
+	u64 gpage_list[MAX_NUMBER_GPAGES];
+	unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#endif
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-	BUG_ON(!hugepd_ok(hpd));
-	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-	return hpd.pd & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
-{
-	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
-	pte_t *dir = hugepd_page(*hpdp);
-
-	return dir + idx;
-}
-
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
 	pgd_t *pg;
@@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			if (is_hugepd(pm))
 				hpdp = (hugepd_t *)pm;
 			else if (!pmd_none(*pm)) {
-				return pte_offset_map(pm, ea);
+				return pte_offset_kernel(pm, ea);
 			}
 		}
 	}
@@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
-				       GFP_KERNEL|__GFP_REPEAT);
+	struct kmem_cache *cachep;
+	pte_t *new;
+
+#ifdef CONFIG_PPC64
+	cachep = PGT_CACHE(pdshift - pshift);
+#else
+	int i;
+	int num_hugepd = 1 << (pshift - pdshift);
+	cachep = hugepte_cache;
+#endif
+
+	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
 
 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC64
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
+		kmem_cache_free(cachep, new);
 	else
-		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
+		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#else
+	/*
+	 * We have multiple higher-level entries that point to the same
+	 * actual pte location.  Fill in each as we go and backtrack on error.
+	 * We need all of these so the DTLB pgtable walk code can find the
+	 * right higher-level entry without knowing if it's a hugepage or not.
+	 */
+	for (i = 0; i < num_hugepd; i++, hpdp++) {
+		if (unlikely(!hugepd_none(*hpdp)))
+			break;
+		else
+			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+	}
+	/* If we bailed from the for loop early, an error occurred, clean up */
+	if (i < num_hugepd) {
+		for (i = i - 1 ; i >= 0; i--, hpdp--)
+			hpdp->pd = 0;
+		kmem_cache_free(cachep, new);
+	}
+#endif
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	return hugepte_offset(hpdp, addr, pdshift);
 }
 
+#ifdef CONFIG_PPC32
 /* Build list of addresses of gigantic pages.  This function is used in early
  * boot before the buddy or bootmem allocator is setup.
  */
-void add_gpage(unsigned long addr, unsigned long page_size,
-	unsigned long number_of_pages)
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+	int i;
+
+	if (addr == 0)
+		return;
+
+	gpage_freearray[idx].nr_gpages = number_of_pages;
+
+	for (i = 0; i < number_of_pages; i++) {
+		gpage_freearray[idx].gpage_list[i] = addr;
+		addr += page_size;
+	}
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+	int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+	if (nr_gpages == 0)
+		return 0;
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If gpages can be in highmem we can't use the trick of storing the
+	 * data structure in the page; allocate space for this
+	 */
+	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+	list_add(&m->list, &huge_boot_pages);
+	gpage_freearray[idx].nr_gpages = nr_gpages;
+	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+	m->hstate = hstate;
+
+	return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val)
+{
+	static phys_addr_t size;
+	unsigned long npages;
+
+	/*
+	 * The hugepagesz and hugepages cmdline options are interleaved.  We
+	 * use the size variable to keep track of whether or not this was done
+	 * properly and skip over instances where it is incorrect.  Other
+	 * command-line parsing code will issue warnings, so we don't need to.
+	 *
+	 */
+	if ((strcmp(param, "default_hugepagesz") == 0) ||
+	    (strcmp(param, "hugepagesz") == 0)) {
+		size = memparse(val, NULL);
+	} else if (strcmp(param, "hugepages") == 0) {
+		if (size != 0) {
+			if (sscanf(val, "%lu", &npages) <= 0)
+				npages = 0;
+			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+			size = 0;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle.  We want to allocate these in highmem because
+ * the amount of lowmem is limited.  This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the
+ * memblock allocator to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+	static __initdata char cmdline[COMMAND_LINE_SIZE];
+	phys_addr_t size, base;
+	int i;
+
+	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+	parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+
+	/*
+	 * Walk gpage list in reverse, allocating larger page sizes first.
+	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+	 * When we reach the point in the list where pages are no longer
+	 * considered gpages, we're done.
+	 */
+	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+			continue;
+		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+			break;
+
+		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+		base = memblock_alloc_base(size * gpage_npages[i], size,
+					   MEMBLOCK_ALLOC_ANYWHERE);
+		add_gpage(base, size, gpage_npages[i]);
+	}
+}
+
+#else /* PPC64 */
+
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 {
 	if (!addr)
 		return;
@@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 	m->hstate = hstate;
 	return 1;
 }
+#endif
 
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
+#ifdef CONFIG_PPC32
+#define HUGEPD_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+	struct rcu_head	rcu;
+	unsigned int index;
+	void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+	struct hugepd_freelist *batch =
+		container_of(head, struct hugepd_freelist, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+	struct hugepd_freelist **batchp;
+
+	batchp = &__get_cpu_var(hugepd_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpumask_equal(mm_cpumask(tlb->mm),
+			  cpumask_of(smp_processor_id()))) {
+		kmem_cache_free(hugepte_cache, hugepte);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+		(*batchp)->index = 0;
+	}
+
+	(*batchp)->ptes[(*batchp)->index++] = hugepte;
+	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+		*batchp = NULL;
+	}
+}
+#endif
+
 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 			      unsigned long start, unsigned long end,
 			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
-	unsigned shift = hugepd_shift(*hpdp);
+	int i;
+
 	unsigned long pdmask = ~((1UL << pdshift) - 1);
+	unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC64
+	unsigned int shift = hugepd_shift(*hpdp);
+#else
+	/* Note: On 32-bit the hpdp may be the first of several */
+	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#endif
 
 	start &= pdmask;
 	if (start < floor)
@@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 	if (end - 1 > ceiling - 1)
 		return;
 
-	hpdp->pd = 0;
+	for (i = 0; i < num_hugepd; i++, hpdp++)
+		hpdp->pd = 0;
+
 	tlb->need_flush = 1;
+#ifdef CONFIG_PPC64
 	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#else
+	hugepd_free(tlb, hugepte);
+#endif
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	 * too.
 	 */
 
-	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset(tlb->mm, addr);
 		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
+#ifdef CONFIG_PPC32
+			/*
+			 * Increment next by the size of the huge mapping since
+			 * on 32-bit there may be more than one entry at the pgd
+			 * level for a single hugepage, but all of them point to
+			 * the same kmem cache that holds the hugepte.
+			 */
+			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
-	} while (pgd++, addr = next, addr != end);
+	} while (addr = next, addr != end);
 }
 
 struct page *
@@ -466,17 +690,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
+#ifdef CONFIG_MM_SLICES
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+#else
+	return get_unmapped_area(file, addr, len, pgoff, flags);
+#endif
 }
 
 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 {
+#ifdef CONFIG_MM_SLICES
 	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 
 	return 1UL << mmu_psize_to_shift(psize);
+#else
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	return huge_page_size(hstate_vma(vma));
+#endif
+}
+
+static inline bool is_power_of_4(unsigned long x)
+{
+	if (is_power_of_2(x))
+		return (__ilog2(x) % 2) ? false : true;
+	return false;
 }
 
 static int __init add_huge_page_size(unsigned long long size)
@@ -486,9 +728,14 @@ static int __init add_huge_page_size(unsigned long long size)
 
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if ((size < PAGE_SIZE) || !is_power_of_4(size))
+		return -EINVAL;
+#else
 	if (!is_power_of_2(size)
 	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
 		return -EINVAL;
+#endif
 
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
@@ -525,6 +772,46 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
+#ifdef CONFIG_FSL_BOOKE
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+
+		if (!mmu_psize_defs[psize].shift)
+			continue;
+
+		shift = mmu_psize_to_shift(psize);
+
+		/* Don't treat normal page sizes as huge... */
+		if (shift != PAGE_SHIFT)
+			if (add_huge_page_size(1ULL << shift) < 0)
+				continue;
+	}
+
+	/*
+	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
+	 * size information encoded in them, so align them to allow this
+	 */
+	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
+					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
+	if (hugepte_cache == NULL)
+		panic("%s: Unable to create kmem cache for hugeptes\n",
+		      __func__);
+
+	/* Default hpage size = 4M */
+	if (mmu_psize_defs[MMU_PAGE_4M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+	else
+		panic("%s: Unable to set default huge page size\n", __func__);
+
+
+	return 0;
+}
+#else
 static int __init hugetlbpage_init(void)
 {
 	int psize;
@@ -567,15 +854,23 @@ static int __init hugetlbpage_init(void)
 
 	return 0;
 }
-
+#endif
 module_init(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)
 {
 	int i;
+	void *start;
 
 	BUG_ON(!PageCompound(page));
 
-	for (i = 0; i < (1UL << compound_order(page)); i++)
-		__flush_dcache_icache(page_address(page+i));
+	for (i = 0; i < (1UL << compound_order(page)); i++) {
+		if (!PageHighMem(page)) {
+			__flush_dcache_icache(page_address(page+i));
+		} else {
+			start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
+			__flush_dcache_icache(start);
+			kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+		}
+	}
 }
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index c77fef5..161cefd 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -32,6 +32,8 @@
 #include <linux/pagemap.h>
 #include <linux/memblock.h>
 #include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -44,6 +46,7 @@
 #include <asm/tlb.h>
 #include <asm/sections.h>
 #include <asm/system.h>
+#include <asm/hugetlb.h>
 
 #include "mmu_decl.h"
 
@@ -123,6 +126,12 @@ void __init MMU_init(void)
 	/* parse args from command line */
 	MMU_setup();
 
+	/*
+	 * Reserve gigantic pages for hugetlb.  This MUST occur before
+	 * lowmem_end_addr is initialized below.
+	 */
+	reserve_hugetlb_gpages();
+
 	if (memblock.memory.cnt > 1) {
 #ifndef CONFIG_WII
 		memblock.memory.cnt = 1;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c781bbc..ad9cf49 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -548,4 +548,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		return;
 	hash_preload(vma->vm_mm, address, access, trap);
 #endif /* CONFIG_PPC_STD_MMU */
+#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
+	&& defined(CONFIG_HUGETLB_PAGE)
+	if (is_vm_hugetlb_page(vma))
+		book3e_hugetlb_preload(vma->vm_mm, address, *ptep);
+#endif
 }
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index 336807d..5b63bd3 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -292,6 +292,11 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
 	mm->context.id = MMU_NO_CONTEXT;
 	mm->context.active = 0;
 
+#ifdef CONFIG_PPC_MM_SLICES
+	if (slice_mm_new_context(mm))
+		slice_set_user_psize(mm, mmu_virtual_psize);
+#endif
+
 	return 0;
 }
 
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index af40c87..214130a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -212,7 +213,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 	entry = set_access_flags_filter(entry, vma, dirty);
 	changed = !pte_same(*(ptep), entry);
 	if (changed) {
-		if (!(vma->vm_flags & VM_HUGETLB))
+		if (!is_vm_hugetlb_page(vma))
 			assert_pte_locked(vma->vm_mm, address);
 		__ptep_set_access_flags(ptep, entry);
 		flush_tlb_page_nohash(vma, address);
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 4ebb34b..dc4a5f3 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -553,24 +553,24 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
 	rldicl	r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
 
 #ifndef CONFIG_PPC_64K_PAGES
 	/* Get to PUD entry */
 	rldicl	r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get to PMD entry */
 	rldicl	r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	virt_page_table_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	virt_page_table_tlb_miss_fault
 
 	/* Ok, we're all right, we can now create a kernel translation for
 	 * a 4K or 64K page from r16 -> r15.
@@ -802,24 +802,24 @@ htw_tlb_miss:
 	rldicl	r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	htw_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
 
 #ifndef CONFIG_PPC_64K_PAGES
 	/* Get to PUD entry */
 	rldicl	r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	htw_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get to PMD entry */
 	rldicl	r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
 	clrrdi	r10,r11,3
 	ldx	r15,r10,r15
-	cmpldi	cr0,r15,0
-	beq	htw_tlb_miss_fault
+	cmpdi	cr0,r15,0
+	bge	htw_tlb_miss_fault
 
 	/* Ok, we're all right, we can now create an indirect entry for
 	 * a 1M or 256M page.
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index d32ec64..afc95c7 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -36,14 +36,49 @@
 #include <linux/spinlock.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/code-patching.h>
+#include <asm/hugetlb.h>
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_BOOK3E
+/*
+ * This struct lists the sw-supported page sizes.  The hardware MMU may support
+ * other sizes not listed here.   The .ind field is only used on MMUs that have
+ * indirect page table entries.
+ */
+#ifdef CONFIG_PPC_BOOK3E_MMU
+#ifdef CONFIG_FSL_BOOKE
+struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
+	[MMU_PAGE_4K] = {
+		.shift	= 12,
+		.enc	= BOOK3E_PAGESZ_4K,
+	},
+	[MMU_PAGE_4M] = {
+		.shift	= 22,
+		.enc	= BOOK3E_PAGESZ_4M,
+	},
+	[MMU_PAGE_16M] = {
+		.shift	= 24,
+		.enc	= BOOK3E_PAGESZ_16M,
+	},
+	[MMU_PAGE_64M] = {
+		.shift	= 26,
+		.enc	= BOOK3E_PAGESZ_64M,
+	},
+	[MMU_PAGE_256M] = {
+		.shift	= 28,
+		.enc	= BOOK3E_PAGESZ_256M,
+	},
+	[MMU_PAGE_1G] = {
+		.shift	= 30,
+		.enc	= BOOK3E_PAGESZ_1GB,
+	},
+};
+#else
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
@@ -77,6 +112,8 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 		.enc	= BOOK3E_PAGESZ_1GB,
 	},
 };
+#endif /* CONFIG_FSL_BOOKE */
+
 static inline int mmu_get_tsize(int psize)
 {
 	return mmu_psize_defs[psize].enc;
@@ -87,7 +124,7 @@ static inline int mmu_get_tsize(int psize)
 	/* This isn't used on !Book3E for now */
 	return 0;
 }
-#endif
+#endif /* CONFIG_PPC_BOOK3E_MMU */
 
 /* The variables below are currently only used on 64-bit Book3E
  * though this will probably be made common with other nohash
@@ -266,6 +303,11 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		flush_hugetlb_page(vma, vmaddr);
+#endif
+
 	__flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr,
 			 mmu_get_tsize(mmu_virtual_psize), 0);
 }
-- 
cgit v1.1


From 6083184269fd723affca4f6340e491950267622a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 10 Aug 2011 20:44:21 +0000
Subject: powerpc/numa: Remove double of_node_put in hot_add_node_scn_to_nid

During memory hotplug testing, I got the following warning:

ERROR: Bad of_node_put() on /memory@0

of_node_release
kref_put
of_node_put
of_find_node_by_type
hot_add_node_scn_to_nid
hot_add_scn_to_nid
memory_add_physaddr_to_nid
...

of_find_node_by_type() loop does the of_node_put for us so we only
need to handle the case where we terminate the loop early.

As suggested by Stephen Rothwell we can do the of_node_put
unconditionally outside of the loop since of_node_put handles a
NULL argument fine.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: stable@kernel.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/numa.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 2164006..2c1ae7a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1214,11 +1214,12 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr)
 			break;
 		}
 
-		of_node_put(memory);
 		if (nid >= 0)
 			break;
 	}
 
+	of_node_put(memory);
+
 	return nid;
 }
 
-- 
cgit v1.1


From 94db7c5e14f44b943febe54e089d077cd983d284 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 10 Aug 2011 20:44:22 +0000
Subject: powerpc: Use for_each_node_by_type instead of open coding it

Use for_each_node_by_type instead of open coding it.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/numa.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 2c1ae7a..00cc090 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -710,7 +710,7 @@ static void __init parse_drconf_memory(struct device_node *memory)
 static int __init parse_numa_properties(void)
 {
 	struct device_node *cpu = NULL;
-	struct device_node *memory = NULL;
+	struct device_node *memory;
 	int default_nid = 0;
 	unsigned long i;
 
@@ -750,8 +750,8 @@ static int __init parse_numa_properties(void)
 	}
 
 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
-	memory = NULL;
-	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+
+	for_each_node_by_type(memory, "memory") {
 		unsigned long start;
 		unsigned long size;
 		int nid;
@@ -1187,10 +1187,10 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
  */
 int hot_add_node_scn_to_nid(unsigned long scn_addr)
 {
-	struct device_node *memory = NULL;
+	struct device_node *memory;
 	int nid = -1;
 
-	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+	for_each_node_by_type(memory, "memory") {
 		unsigned long start, size;
 		int ranges;
 		const unsigned int *memcell_buf;
-- 
cgit v1.1


From dfbe93a222e74b6f96ad84eff2b04a0f864fac65 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 10 Aug 2011 20:44:23 +0000
Subject: powerpc: Coding style cleanups

While converting code to use for_each_node_by_type I noticed a
number of coding style issues.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/numa.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 00cc090..0bfb90c 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -709,7 +709,6 @@ static void __init parse_drconf_memory(struct device_node *memory)
 
 static int __init parse_numa_properties(void)
 {
-	struct device_node *cpu = NULL;
 	struct device_node *memory;
 	int default_nid = 0;
 	unsigned long i;
@@ -732,6 +731,7 @@ static int __init parse_numa_properties(void)
 	 * each node to be onlined must have NODE_DATA etc backing it.
 	 */
 	for_each_present_cpu(i) {
+		struct device_node *cpu;
 		int nid;
 
 		cpu = of_get_cpu_node(i, NULL);
@@ -800,8 +800,9 @@ new_range:
 	}
 
 	/*
-	 * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory
-	 * property in the ibm,dynamic-reconfiguration-memory node.
+	 * Now do the same thing for each MEMBLOCK listed in the
+	 * ibm,dynamic-memory property in the
+	 * ibm,dynamic-reconfiguration-memory node.
 	 */
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (memory)
-- 
cgit v1.1


From a11940978bd598e65996b4f807cf4904793f7025 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 10 Aug 2011 20:44:24 +0000
Subject: powerpc: Fix oops when echoing bad values to
 /sys/devices/system/memory/probe

If we echo an address the hypervisor doesn't like to
/sys/devices/system/memory/probe we oops the box:

# echo 0x10000000000 > /sys/devices/system/memory/probe

kernel BUG at arch/powerpc/mm/hash_utils_64.c:541!

The backtrace is:

create_section_mapping
arch_add_memory
add_memory
memory_probe_store
sysdev_class_store
sysfs_write_file
vfs_write
SyS_write

In create_section_mapping we BUG if htab_bolt_mapping returned
an error. A better approach is to return an error which will
propagate back to userspace.

Rerunning the test with this patch applied:

# echo 0x10000000000 > /sys/devices/system/memory/probe
-bash: echo: write error: Invalid argument

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: stable@kernel.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/hash_utils_64.c | 6 +++---
 arch/powerpc/mm/mem.c           | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1f8b2a0..1628201 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -531,11 +531,11 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-void create_section_mapping(unsigned long start, unsigned long end)
+int create_section_mapping(unsigned long start, unsigned long end)
 {
-	BUG_ON(htab_bolt_mapping(start, end, __pa(start),
+	return htab_bolt_mapping(start, end, __pa(start),
 				 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
-				 mmu_kernel_ssize));
+				 mmu_kernel_ssize);
 }
 
 int remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ad9cf49..5db316c 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -123,7 +123,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
-	create_section_mapping(start, start + size);
+	if (create_section_mapping(start, start + size))
+		return -EINVAL;
 
 	/* this should work for most non-highmem platforms */
 	zone = pgdata->node_zones;
-- 
cgit v1.1


From 8bdafa39a47265bc029838b35cc6585f69224afa Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 14 Sep 2011 09:43:15 +0000
Subject: powerpc: Fix deadlock in icswx code

The icswx code introduced an A-B B-A deadlock:

     CPU0                    CPU1
     ----                    ----
lock(&anon_vma->mutex);
                             lock(&mm->mmap_sem);
                             lock(&anon_vma->mutex);
lock(&mm->mmap_sem);

Instead of using the mmap_sem to keep mm_users constant, take the
page table spinlock.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: <stable@kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/mmu_context_hash64.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 3bafc3d..4ff587e 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -136,8 +136,8 @@ int use_cop(unsigned long acop, struct mm_struct *mm)
 	if (!mm || !acop)
 		return -EINVAL;
 
-	/* We need to make sure mm_users doesn't change */
-	down_read(&mm->mmap_sem);
+	/* The page_table_lock ensures mm_users won't change under us */
+	spin_lock(&mm->page_table_lock);
 	spin_lock(mm->context.cop_lockp);
 
 	if (mm->context.cop_pid == COP_PID_NONE) {
@@ -164,7 +164,7 @@ int use_cop(unsigned long acop, struct mm_struct *mm)
 
 out:
 	spin_unlock(mm->context.cop_lockp);
-	up_read(&mm->mmap_sem);
+	spin_unlock(&mm->page_table_lock);
 
 	return ret;
 }
@@ -185,8 +185,8 @@ void drop_cop(unsigned long acop, struct mm_struct *mm)
 	if (WARN_ON_ONCE(!mm))
 		return;
 
-	/* We need to make sure mm_users doesn't change */
-	down_read(&mm->mmap_sem);
+	/* The page_table_lock ensures mm_users won't change under us */
+	spin_lock(&mm->page_table_lock);
 	spin_lock(mm->context.cop_lockp);
 
 	mm->context.acop &= ~acop;
@@ -213,7 +213,7 @@ void drop_cop(unsigned long acop, struct mm_struct *mm)
 	}
 
 	spin_unlock(mm->context.cop_lockp);
-	up_read(&mm->mmap_sem);
+	spin_unlock(&mm->page_table_lock);
 }
 EXPORT_SYMBOL_GPL(drop_cop);
 
-- 
cgit v1.1


From 25c29f9e3242071bca1bee7ad919baf1888ae436 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 20 Sep 2011 19:58:10 +0000
Subject: powerpc: Fix hugetlb with CONFIG_PPC_MM_SLICES=y

Commit 41151e77a4 ("powerpc: Hugetlb for BookE") added some
#ifdef CONFIG_MM_SLICES conditionals to hugetlb_get_unmapped_area()
and vma_mmu_pagesize().  Unfortunately this is not the correct config
symbol; it should be CONFIG_PPC_MM_SLICES.  The result is that
attempting to use hugetlbfs on 64-bit Power server processors results
in an infinite stack recursion between get_unmapped_area() and
hugetlb_get_unmapped_area().

This fixes it by changing the #ifdef to use CONFIG_PPC_MM_SLICES
in those functions and also in book3e_hugetlb_preload().

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/mm/hugetlbpage-book3e.c | 2 +-
 arch/powerpc/mm/hugetlbpage.c        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 1295b7c..343ad0b 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -52,7 +52,7 @@ void book3e_hugetlb_preload(struct mm_struct *mm, unsigned long ea, pte_t pte)
 	if (unlikely(is_kernel_addr(ea)))
 		return;
 
-#ifdef CONFIG_MM_SLICES
+#ifdef CONFIG_PPC_MM_SLICES
 	psize = mmu_get_tsize(get_slice_psize(mm, ea));
 	tsize = mmu_get_psize(psize);
 	shift = mmu_psize_defs[psize].shift;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 3a5f59d..48b65be 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -690,7 +690,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
-#ifdef CONFIG_MM_SLICES
+#ifdef CONFIG_PPC_MM_SLICES
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
@@ -702,7 +702,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 {
-#ifdef CONFIG_MM_SLICES
+#ifdef CONFIG_PPC_MM_SLICES
 	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 
 	return 1UL << mmu_psize_to_shift(psize);
-- 
cgit v1.1


From 1dc91c3eb374ca01ec99dc0ca2a38babc509beb3 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Fri, 16 Sep 2011 10:39:59 -0500
Subject: powerpc/fsl-booke: Fix setup_initial_memory_limit to not blindly map

On FSL Book-E devices we support multiple large TLB sizes and so we can
get into situations in which the initial 1G TLB size is too big and
we're asked for a size that is not mappable by a single entry (like
512M).  The single entry is important because when we bring up secondary
cores they need to ensure any data structure they need to access (eg
PACA or stack) is always mapped.

So we really need to determine what size will actually be mapped by the
first TLB entry to ensure we limit early memory references to that
region.  We refactor the map_mem_in_cams() code to provide a helper
function that we can utilize to determine the size of the first TLB
entry while taking into account size and alignment constraints.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/mm/fsl_booke_mmu.c | 31 +++++++++++++++++++------------
 arch/powerpc/mm/mmu_decl.h      |  2 ++
 arch/powerpc/mm/tlb_nohash.c    | 21 ++++++++++++++++++---
 3 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index f7802c8..6f593bd 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -146,29 +146,36 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 	loadcam_entry(index);
 }
 
+unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+			  phys_addr_t phys)
+{
+	unsigned int camsize = __ilog2(ram) & ~1U;
+	unsigned int align = __ffs(virt | phys) & ~1U;
+	unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf;
+
+	/* Convert (4^max) kB to (2^max) bytes */
+	max_cam = max_cam * 2 + 10;
+
+	if (camsize > align)
+		camsize = align;
+	if (camsize > max_cam)
+		camsize = max_cam;
+
+	return 1UL << camsize;
+}
+
 unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
 {
 	int i;
 	unsigned long virt = PAGE_OFFSET;
 	phys_addr_t phys = memstart_addr;
 	unsigned long amount_mapped = 0;
-	unsigned long max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xf;
-
-	/* Convert (4^max) kB to (2^max) bytes */
-	max_cam = max_cam * 2 + 10;
 
 	/* Calculate CAM values */
 	for (i = 0; ram && i < max_cam_idx; i++) {
-		unsigned int camsize = __ilog2(ram) & ~1U;
-		unsigned int align = __ffs(virt | phys) & ~1U;
 		unsigned long cam_sz;
 
-		if (camsize > align)
-			camsize = align;
-		if (camsize > max_cam)
-			camsize = max_cam;
-
-		cam_sz = 1UL << camsize;
+		cam_sz = calc_cam_sz(ram, virt, phys);
 		settlbcam(i, virt, phys, cam_sz, PAGE_KERNEL_X, 0);
 
 		ram -= cam_sz;
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index dd0a258..83eb5d5 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -142,6 +142,8 @@ extern unsigned long mmu_mapin_ram(unsigned long top);
 
 #elif defined(CONFIG_PPC_FSL_BOOK3E)
 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx);
+extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
+				 phys_addr_t phys);
 #ifdef CONFIG_PPC32
 extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(unsigned long top);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index afc95c7..6c2eabf 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -642,13 +642,28 @@ void __cpuinit early_init_mmu_secondary(void)
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				phys_addr_t first_memblock_size)
 {
-	/* On Embedded 64-bit, we adjust the RMA size to match
+	/* On non-FSL Embedded 64-bit, we adjust the RMA size to match
 	 * the bolted TLB entry. We know for now that only 1G
 	 * entries are supported though that may eventually
-	 * change. We crop it to the size of the first MEMBLOCK to
+	 * change.
+	 *
+	 * on FSL Embedded 64-bit, we adjust the RMA size to match the
+	 * first bolted TLB entry size.  We still limit max to 1G even if
+	 * the TLB could cover more.  This is due to what the early init
+	 * code is setup to do.
+	 *
+	 * We crop it to the size of the first MEMBLOCK to
 	 * avoid going over total available memory just in case...
 	 */
-	ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+		unsigned long linear_sz;
+		linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET,
+					first_memblock_base);
+		ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
+	} else
+#endif
+		ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
 
 	/* Finally limit subsequent allocations */
 	memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
-- 
cgit v1.1


From 4559424a0c34f0cb22fa31bc24015a06dc064b32 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 12 Oct 2011 16:17:02 -0500
Subject: powerpc/fsl-booke: Fix settlbcam for 64-bit

Currently, it does a cntlzd on the size and then subtracts it from
21; this doesn't take into account the varying size of a "long".
Just use __ilog2 instead (and subtract the 10 we have to subtract
to get to the tsize encoding).

Also correct the comment about page sizes supported.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/mm/fsl_booke_mmu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index 6f593bd..66a6fd3 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -101,17 +101,17 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
 
 /*
  * Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
- * in particular size must be a power of 4 between 4k and 256M (or 1G, for cpus
- * that support extended page sizes).  Note that while some cpus support a
- * page size of 4G, we don't allow its use here.
+ * in particular size must be a power of 4 between 4k and the max supported by
+ * an implementation; max may further be limited by what can be represented in
+ * an unsigned long (for example, 32-bit implementations cannot support a 4GB
+ * size).
  */
 static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 		unsigned long size, unsigned long flags, unsigned int pid)
 {
-	unsigned int tsize, lz;
+	unsigned int tsize;
 
-	asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (size));
-	tsize = 21 - lz;
+	tsize = __ilog2(size) - 10;
 
 #ifdef CONFIG_SMP
 	if ((flags & _PAGE_NO_CACHE) == 0)
-- 
cgit v1.1