hugetlb, rmap: add reverse mapping for hugepage

This patch adds reverse mapping feature for hugepage by introducing mapcount for shared/private-mapped hugepage and anon_vma for private-mapped hugepage. While hugepage is not currently swappable, reverse mapping can be useful for memory error handler. Without this patch, memory error handler cannot identify processes using the bad hugepage nor unmap it from them. That is: - for shared hugepage: we can collect processes using a hugepage through pagecache, but can not unmap the hugepage because of the lack of mapcount. - for privately mapped hugepage: we can neither collect processes nor unmap the hugepage. This patch solves these problems. This patch include the bug fix given by commit 23be7468e8, so reverts it. Dependency: "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h" ChangeLog since May 24. - create hugetlb_inline.h and move is_vm_hugetlb_index() in it. - move functions setting up anon_vma for hugepage into mm/rmap.c. ChangeLog since May 13. - rebased to 2.6.34 - fix logic error (in case that private mapping and shared mapping coexist) - move is_vm_hugetlb_page() into include/linux/mm.h to use this function from linear_page_index() - define and use linear_hugepage_index() instead of compound_order() - use page_move_anon_rmap() in hugetlb_cow() - copy exclusive switch of __set_page_anon_rmap() into hugepage counterpart. - revert commit 24be7468 completely Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Acked-by: Fengguang Wu <fengguang.wu@intel.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andi Kleen <ak@linux.intel.com>
author: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> 2010-05-28 09:29:16 +0900
committer: Andi Kleen <ak@linux.intel.com> 2010-08-11 09:21:15 +0200
commit: 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f (patch)
tree: 3014636f2ed66fdebecb6f6bab338b39c3543a07 /mm
parent: 8edf344c66a3f214d709dad1421c29d678915b3f (diff)
download: op-kernel-dev-0fe6e20b9c4c53b3e97096ee73a0857f60aad43f.zip
op-kernel-dev-0fe6e20b9c4c53b3e97096ee73a0857f60aad43f.tar.gz
2 files changed, 101 insertions, 2 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b0..aa3c517 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/rmap.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 			(vma->vm_pgoff >> huge_page_order(h));
 }
 
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+				     unsigned long address)
+{
+	return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
  * cases this will be same size as used by the page table entries.
@@ -552,6 +559,7 @@ static void free_huge_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
+	BUG_ON(page_mapcount(page));
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
@@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
 			get_page(ptepage);
+			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
+		page_remove_rmap(page);
 		list_del(&page->lru);
 		put_page(page);
 	}
@@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 1;
 }
 
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
 			struct page *pagecache_page)
@@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
-	avoidcopy = (page_count(old_page) == 1);
+	avoidcopy = (page_mapcount(old_page) == 1);
 	if (avoidcopy) {
+		if (!trylock_page(old_page))
+			if (PageAnon(old_page))
+				page_move_anon_rmap(old_page, vma, address);
 		set_huge_ptep_writable(vma, address, ptep);
 		return 0;
 	}
@@ -2338,6 +2354,13 @@ retry_avoidcopy:
 		return -PTR_ERR(new_page);
 	}
 
+	/*
+	 * When the original hugepage is shared one, it does not have
+	 * anon_vma prepared.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+
 	copy_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
@@ -2352,6 +2375,8 @@ retry_avoidcopy:
 		huge_ptep_clear_flush(vma, address, ptep);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
+		page_remove_rmap(old_page);
+		hugepage_add_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -2452,10 +2477,17 @@ retry:
 			spin_lock(&inode->i_lock);
 			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
+			page_dup_rmap(page);
 		} else {
 			lock_page(page);
-			page->mapping = HUGETLB_POISON;
+			if (unlikely(anon_vma_prepare(vma))) {
+				ret = VM_FAULT_OOM;
+				goto backout_unlocked;
+			}
+			hugepage_add_new_anon_rmap(page, vma, address);
 		}
+	} else {
+		page_dup_rmap(page);
 	}
 
 	/*
@@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
@@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 								vma, address);
 	}
 
+	if (!pagecache_page) {
+		page = pte_page(entry);
+		lock_page(page);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2611,8 @@ out_page_table_lock:
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
+	} else {
+		unlock_page(page);
 	}
 
 out_mutex:
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e..0ad5357 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 
@@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned long address;
 
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		pgoff = page->index << huge_page_order(page_hstate(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 		/* page should be within @vma mapping range */
@@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	if (unlikely(PageHuge(page))) {
+		pte = huge_pte_offset(mm, address);
+		ptl = &mm->page_table_lock;
+		goto check;
+	}
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		return NULL;
@@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	}
 
 	ptl = pte_lockptr(mm, pmd);
+check:
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page)
 		page_clear_dirty(page);
 		set_page_dirty(page);
 	}
+	/*
+	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+	 * and not charged by memcg for now.
+	 */
+	if (unlikely(PageHuge(page)))
+		return;
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1445,3 +1461,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 		return rmap_walk_file(page, rmap_one, arg);
 }
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	BUG_ON(!anon_vma);
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev,
+				 struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+	BUG_ON(!anon_vma);
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLBFS */
author	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>	2010-05-28 09:29:16 +0900
committer	Andi Kleen <ak@linux.intel.com>	2010-08-11 09:21:15 +0200
commit	0fe6e20b9c4c53b3e97096ee73a0857f60aad43f (patch)
tree	3014636f2ed66fdebecb6f6bab338b39c3543a07 /mm
parent	8edf344c66a3f214d709dad1421c29d678915b3f (diff)
download	op-kernel-dev-0fe6e20b9c4c53b3e97096ee73a0857f60aad43f.zip op-kernel-dev-0fe6e20b9c4c53b3e97096ee73a0857f60aad43f.tar.gz