summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-17 12:58:52 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-17 12:58:52 -0800
commit0cbeafb245ca568bc0765645aa64f0451b716657 (patch)
tree663c09ff5a62a1b2b66a17c4dfe0413603530a36 /mm
parent58cf279acac3080ce03eeea5ca268210b3165fe1 (diff)
parent06b031de22d28ae76b2e5bfaf22c56a265a1e106 (diff)
downloadop-kernel-dev-0cbeafb245ca568bc0765645aa64f0451b716657.zip
op-kernel-dev-0cbeafb245ca568bc0765645aa64f0451b716657.tar.gz
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton: - more MM stuff: - Kirill's page-flags rework - Kirill's now-allegedly-fixed THP rework - MADV_FREE implementation - DAX feature work (msync/fsync). This isn't quite complete but DAX is new and it's good enough and the guys have a handle on what needs to be done - I expect this to be wrapped in the next week or two. - some vsprintf maintenance work - various other misc bits * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (145 commits) printk: change recursion_bug type to bool lib/vsprintf: factor out %pN[F] handler as netdev_bits() lib/vsprintf: refactor duplicate code to special_hex_number() printk-formats.txt: remove unimplemented %pT printk: help pr_debug and pr_devel to optimize out arguments lib/test_printf.c: test dentry printing lib/test_printf.c: add test for large bitmaps lib/test_printf.c: account for kvasprintf tests lib/test_printf.c: add a few number() tests lib/test_printf.c: test precision quirks lib/test_printf.c: check for out-of-bound writes lib/test_printf.c: don't BUG lib/kasprintf.c: add sanity check to kvasprintf lib/vsprintf.c: warn about too large precisions and field widths lib/vsprintf.c: help gcc make number() smaller lib/vsprintf.c: expand field_width to 24 bits lib/vsprintf.c: eliminate potential race in string() lib/vsprintf.c: move string() below widen_string() lib/vsprintf.c: pull out padding code from dentry_name() printk: do cond_resched() between lines while outputting to consoles ...
Diffstat (limited to 'mm')
-rw-r--r--mm/debug.c8
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c172
-rw-r--r--mm/huge_memory.c1506
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/internal.h70
-rw-r--r--mm/ksm.c69
-rw-r--r--mm/madvise.c201
-rw-r--r--mm/memcontrol.c106
-rw-r--r--mm/memory-failure.c125
-rw-r--r--mm/memory.c101
-rw-r--r--mm/memory_hotplug.c67
-rw-r--r--mm/mempolicy.c45
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c27
-rw-r--r--mm/mmap.c25
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c15
-rw-r--r--mm/page_alloc.c47
-rw-r--r--mm/page_idle.c27
-rw-r--r--mm/page_isolation.c6
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/rmap.c369
-rw-r--r--mm/shmem.c25
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse-vmemmap.c76
-rw-r--r--mm/sparse.c8
-rw-r--r--mm/swap.c319
-rw-r--r--mm/swap_state.c9
-rw-r--r--mm/swapfile.c34
-rw-r--r--mm/userfaultfd.c8
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c16
-rw-r--r--mm/vmstat.c5
37 files changed, 2155 insertions, 1444 deletions
diff --git a/mm/debug.c b/mm/debug.c
index 5d2072e..f05b2d5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- {1UL << PG_compound_lock, "compound_lock" },
-#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags,
void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags)
{
- pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
+ if (PageCompound(page))
+ pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+ pr_cont("\n");
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
if (reason)
diff --git a/mm/filemap.c b/mm/filemap.c
index ff42d31..847ee43c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
- BUG_ON(page_mapped(page));
+ VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* At this point page must be either written or cleaned by truncate.
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg);
+ gfp_mask, &memcg, false);
if (error)
return error;
}
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
@@ -653,7 +653,7 @@ err_insert:
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return error;
}
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c..b64a361 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
+#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
}
page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
+ if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+ /*
+ * Only return device mapping pages in the FOLL_GET case since
+ * they are only valid while holding the pgmap reference.
+ */
+ pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+ if (pgmap)
+ page = pte_page(pte);
+ else
+ goto no_page;
+ } else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -116,8 +128,28 @@ retry:
}
}
- if (flags & FOLL_GET)
- get_page_foll(page);
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ int ret;
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
+ if (flags & FOLL_GET) {
+ get_page(page);
+
+ /* drop the pgmap reference now that we hold the page */
+ if (pgmap) {
+ put_dev_pagemap(pgmap);
+ pgmap = NULL;
+ }
+ }
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +162,10 @@ retry:
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
+ if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
+ page = follow_devmap_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (likely(!pmd_trans_huge(*pmd)))
+ return follow_page_pte(vma, address, pmd, flags);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ if (flags & FOLL_SPLIT) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ } else {
+ get_page(page);
spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags);
}
- return follow_page_pte(vma, address, pmd, flags);
+
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages);
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
+ * does not allow retry
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages);
* The main difference with get_user_pages() is that this function will
* unconditionally call handle_mm_fault() which will in turn perform all the
* necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
*
* This is important for some architectures where those bits also gate the
* access permission to the page because they are maintained in software. On
* such architectures, gup() will not be enough to make a subsequent access
* succeed.
*
- * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_sem. So it has not the
+ * same semantics wrt the @mm->mmap_sem as does filemap_fault().
*/
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long address, unsigned int fault_flags)
+ unsigned long address, unsigned int fault_flags,
+ bool *unlocked)
{
struct vm_area_struct *vma;
vm_flags_t vm_flags;
- int ret;
+ int ret, major = 0;
+ if (unlocked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+
+retry:
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
@@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
+ major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return -ENOMEM;
@@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -EFAULT;
BUG();
}
+
+ if (ret & VM_FAULT_RETRY) {
+ down_read(&mm->mmap_sem);
+ if (!(fault_flags & FAULT_FLAG_TRIED)) {
+ *unlocked = true;
+ fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ fault_flags |= FAULT_FLAG_TRIED;
+ goto retry;
+ }
+ }
+
if (tsk) {
- if (ret & VM_FAULT_MAJOR)
+ if (major)
tsk->maj_flt++;
else
tsk->min_flt++;
@@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
gup_flags &= ~FOLL_POPULATE;
-
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr)
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
* pages containing page tables.
*
- * *) THP splits will broadcast an IPI, this can be achieved by overriding
- * pmdp_splitting_flush.
- *
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
- struct page *page;
+ struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ head = compound_head(page);
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
+ put_page(head);
goto pte_unmap;
}
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
@@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
@@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pud_write(orig))
@@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page, *tail;
+ struct page *head, *page;
if (write && !pgd_write(orig))
return 0;
@@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f952f05..b2db981 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,12 +16,16 @@
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
+#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
+#include <linux/pfn_t.h>
#include <linux/mman.h>
+#include <linux/memremap.h>
#include <linux/pagemap.h>
+#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
@@ -45,6 +49,7 @@ enum scan_result {
SCAN_PAGE_LRU,
SCAN_PAGE_LOCK,
SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
SCAN_ANY_PROCESS,
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
@@ -133,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
{
@@ -665,6 +674,9 @@ static int __init hugepage_init(void)
err = register_shrinker(&huge_zero_page_shrinker);
if (err)
goto err_hzp_shrinker;
+ err = register_shrinker(&deferred_split_shrinker);
+ if (err)
+ goto err_split_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
@@ -682,6 +694,8 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
+ unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
khugepaged_slab_exit();
@@ -738,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
return entry;
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * ->lru in the tail pages is occupied by compound_head.
+ * Let's use ->mapping + ->index in the second tail page as list_head.
+ */
+ return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+ /*
+ * we use page->mapping and page->indexlru in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+ INIT_LIST_HEAD(page_deferred_list(page));
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -751,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -759,7 +794,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return VM_FAULT_OOM;
}
@@ -775,7 +810,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
} else {
@@ -786,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
int ret;
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
ret = handle_userfault(vma, address, flags,
@@ -797,8 +832,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@@ -892,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
+ prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
flags);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+ pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
spinlock_t *ptl;
ptl = pmd_lock(mm, pmd);
- if (pmd_none(*pmd)) {
- entry = pmd_mkhuge(pfn_pmd(pfn, prot));
- if (write) {
- entry = pmd_mkyoung(pmd_mkdirty(entry));
- entry = maybe_pmd_mkwrite(entry, vma);
- }
- set_pmd_at(mm, addr, pmd, entry);
- update_mmu_cache_pmd(vma, addr, pmd);
- }
+ entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pmd_mkdevmap(entry);
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
spin_unlock(ptl);
}
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned long pfn, bool write)
+ pmd_t *pmd, pfn_t pfn, bool write)
{
pgprot_t pgprot = vma->vm_page_prot;
/*
@@ -929,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+ BUG_ON(!pfn_t_devmap(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
@@ -939,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
return VM_FAULT_NOPAGE;
}
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ pmd_t _pmd;
+
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but for now
+ * the dirty bit in the pmd is meaningless. And if the dirty
+ * bit will become meaningful and we'll only set it with
+ * FOLL_WRITE, an atomic set_bit will be required on the pmd to
+ * set the young bit, instead of the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, int flags)
+{
+ unsigned long pfn = pmd_pfn(*pmd);
+ struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap;
+ struct page *page;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ return NULL;
+
+ if (pmd_present(*pmd) && pmd_devmap(*pmd))
+ /* pass */;
+ else
+ return NULL;
+
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd);
+
+ /*
+ * device mapped pages can only be returned if the
+ * caller will manage the page reference count.
+ */
+ if (!(flags & FOLL_GET))
+ return ERR_PTR(-EEXIST);
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ERR_PTR(-EFAULT);
+ page = pfn_to_page(pfn);
+ get_page(page);
+ put_dev_pagemap(pgmap);
+
+ return page;
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
@@ -960,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(!pmd_trans_huge(pmd))) {
+ if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
pte_free(dst_mm, pgtable);
goto out_unlock;
}
@@ -983,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (unlikely(pmd_trans_splitting(pmd))) {
- /* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- pte_free(dst_mm, pgtable);
-
- wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
- goto out;
+ if (pmd_trans_huge(pmd)) {
+ /* thp accounting separate from pmd_devmap accounting */
+ src_page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&dst_mm->nr_ptes);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
}
- src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- get_page(src_page);
- page_dup_rmap(src_page);
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- atomic_long_inc(&dst_mm->nr_ptes);
ret = 0;
out_unlock:
@@ -1035,37 +1122,6 @@ unlock:
spin_unlock(ptl);
}
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- atomic_add(HPAGE_PMD_NR, &page->_count);
- while (++page < endpage)
- get_huge_page_tail(page);
- } else {
- get_page(page);
- }
-}
-
-static void put_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- while (page < endpage)
- put_page(page++);
- } else {
- put_page(page);
- }
-}
-
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -1095,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg))) {
+ &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg,
+ false);
put_page(pages[i]);
}
kfree(pages);
@@ -1139,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr);
- mem_cgroup_commit_charge(pages[i], memcg, false);
+ page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
@@ -1151,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1168,7 +1225,7 @@ out_free_pages:
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg, false);
put_page(pages[i]);
}
kfree(pages);
@@ -1198,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- if (page_mapcount(page) == 1) {
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part. We can do it by checking page_mapcount() on each sub-page, but
+ * it's expensive.
+ * The cheaper way is to check page_count() to be equal 1: every
+ * mapcount takes page reference reference, so this way we can
+ * guarantee, that the PMD is the only mapping.
+ * This can give false negative if somebody pinned the page, but that's
+ * fine.
+ */
+ if (page_mapcount(page) == 1 && page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1207,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
- get_user_huge_page(page);
+ get_page(page);
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
@@ -1217,30 +1284,33 @@ alloc:
} else
new_page = NULL;
- if (unlikely(!new_page)) {
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
if (!page) {
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) {
- split_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
}
- put_user_huge_page(page);
+ put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
+ true))) {
put_page(new_page);
if (page) {
- split_huge_page(page);
- put_user_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
+ put_page(page);
} else
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1260,10 +1330,10 @@ alloc:
spin_lock(ptl);
if (page)
- put_user_huge_page(page);
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
} else {
@@ -1271,8 +1341,8 @@ alloc:
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- page_add_new_anon_rmap(new_page, vma, haddr);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
@@ -1281,7 +1351,7 @@ alloc:
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
put_page(page);
}
ret |= VM_FAULT_WRITE;
@@ -1319,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- if (flags & FOLL_TOUCH) {
- pmd_t _pmd;
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd);
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
- * We should set the dirty bit only for FOLL_WRITE but
- * for now the dirty bit in the pmd is meaningless.
- * And if the dirty bit will become meaningful and
- * we'll only set it with FOLL_WRITE, an atomic
- * set_bit will be required on the pmd to set the
- * young bit, instead of the current set_pmd_at.
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we
+ * mlocking read-only mapping shared over fork(). We skip
+ * mlocking such pages.
*/
- _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
- if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, 1))
- update_mmu_cache_pmd(vma, addr, pmd);
- }
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- if (page->mapping && trylock_page(page)) {
+ if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+ page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
@@ -1345,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
out:
return page;
@@ -1480,13 +1550,84 @@ out:
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, unsigned long next)
+
+{
+ spinlock_t *ptl;
+ pmd_t orig_pmd;
+ struct page *page;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 0;
+
+ if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+ goto out;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = 1;
+ goto out;
+ }
+
+ page = pmd_page(orig_pmd);
+ /*
+ * If other processes are mapping this page, we couldn't discard
+ * the page unless they all do MADV_FREE so let's skip the page.
+ */
+ if (page_mapcount(page) != 1)
+ goto out;
+
+ if (!trylock_page(page))
+ goto out;
+
+ /*
+ * If user want to discard part-pages of THP, split it so MADV_FREE
+ * will deactivate only them.
+ */
+ if (next - addr != HPAGE_PMD_SIZE) {
+ get_page(page);
+ spin_unlock(ptl);
+ if (split_huge_page(page)) {
+ put_page(page);
+ unlock_page(page);
+ goto out_unlocked;
+ }
+ put_page(page);
+ unlock_page(page);
+ ret = 1;
+ goto out_unlocked;
+ }
+
+ if (PageDirty(page))
+ ClearPageDirty(page);
+ unlock_page(page);
+
+ if (PageActive(page))
+ deactivate_page(page);
+
+ if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+ ret = 1;
+out:
+ spin_unlock(ptl);
+out_unlocked:
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
@@ -1508,7 +1649,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
put_huge_zero_page();
} else {
struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1520,13 +1661,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
- int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
@@ -1535,7 +1675,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
@@ -1543,15 +1683,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
- goto out;
+ return false;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
- if (ret == 1) {
+ if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1567,9 +1706,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ return true;
}
-out:
- return ret;
+ return false;
}
/*
@@ -1585,7 +1724,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
@@ -1616,405 +1755,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
/*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ * Returns true if a given pmd maps a thp, false otherwise.
*
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
*ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- return -1;
- } else {
- /* Thp mapped by 'pmd' is stable, so we can
- * handle it as it is. */
- return 1;
- }
- }
- spin_unlock(*ptl);
- return 0;
-}
-
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
-
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
- if (pmd_page(*pmd) != page)
- goto unlock;
- /*
- * split_vma() may create temporary aliased mappings. There is
- * no risk as long as all huge pmd are found and have their
- * splitting bit set before __split_huge_page_refcount
- * runs. Finding the same huge pmd more than once during the
- * same rmap walk is not a problem.
- */
- if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd))
- goto unlock;
- if (pmd_trans_huge(*pmd)) {
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
- !pmd_trans_splitting(*pmd));
- return pmd;
- }
-unlock:
+ if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ return true;
spin_unlock(*ptl);
- return NULL;
-}
-
-static int __split_huge_page_splitting(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd;
- int ret = 0;
- /* For mmu_notifiers */
- const unsigned long mmun_start = address;
- const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
-
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
- if (pmd) {
- /*
- * We can't temporarily set the pmd to null in order
- * to split it, the pmd must remain marked huge at all
- * times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->rwsem to
- * serialize against split_huge_page*.
- */
- pmdp_splitting_flush(vma, address, pmd);
-
- ret = 1;
- spin_unlock(ptl);
- }
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- return ret;
-}
-
-static void __split_huge_page_refcount(struct page *page,
- struct list_head *list)
-{
- int i;
- struct zone *zone = page_zone(page);
- struct lruvec *lruvec;
- int tail_count = 0;
-
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
-
- compound_lock(page);
- /* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(page);
-
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
- struct page *page_tail = page + i;
-
- /* tail_page->_mapcount cannot change */
- BUG_ON(page_mapcount(page_tail) < 0);
- tail_count += page_mapcount(page_tail);
- /* check for overflow */
- BUG_ON(tail_count < 0);
- BUG_ON(atomic_read(&page_tail->_count) != 0);
- /*
- * tail_page->_count is zero and not changing from
- * under us. But get_page_unless_zero() may be running
- * from under us on the tail_page. If we used
- * atomic_set() below instead of atomic_add(), we
- * would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is
- * implemented in C not using locked ops. spin_unlock
- * on x86 sometime uses locked ops because of PPro
- * errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and
- * not only on x86), it's safer to use atomic_add().
- */
- atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
- &page_tail->_count);
-
- /* after clearing PageTail the gup refcount can be released */
- smp_mb__after_atomic();
-
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_unevictable)));
- page_tail->flags |= (1L << PG_dirty);
-
- clear_compound_head(page_tail);
-
- if (page_is_young(page))
- set_page_young(page_tail);
- if (page_is_idle(page))
- set_page_idle(page_tail);
-
- /*
- * __split_huge_page_splitting() already set the
- * splitting bit in all pmd that could map this
- * hugepage, that will ensure no CPU can alter the
- * mapcount on the head page. The mapcount is only
- * accounted in the head page and it has to be
- * transferred to all tail pages in the below code. So
- * for this code to be safe, the split the mapcount
- * can't change. But that doesn't mean userland can't
- * keep changing and reading the page contents while
- * we transfer the mapcount, so the pmd splitting
- * status is achieved setting a reserved bit in the
- * pmd, not by clearing the present bit.
- */
- page_tail->_mapcount = page->_mapcount;
-
- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
-
- page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(page, page_tail, lruvec, list);
- }
- atomic_sub(tail_count, &page->_count);
- BUG_ON(atomic_read(&page->_count) <= 0);
-
- __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
- ClearPageCompound(page);
- compound_unlock(page);
- spin_unlock_irq(&zone->lru_lock);
-
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
- }
-
- /*
- * Only the head page (now become a regular page) is required
- * to be pinned by the caller.
- */
- BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd, _pmd;
- int ret = 0, i;
- pgtable_t pgtable;
- unsigned long haddr;
-
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
- if (pmd_write(*pmd))
- BUG_ON(page_mapcount(page) != 1);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
-
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_pmd_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
- spin_unlock(ptl);
- }
-
- return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma,
- struct list_head *list)
-{
- int mapcount, mapcount2;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
-
- BUG_ON(!PageHead(page));
- BUG_ON(PageTail(page));
-
- mapcount = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount += __split_huge_page_splitting(page, vma, addr);
- }
- /*
- * It is critical that new vmas are added to the tail of the
- * anon_vma list. This guarantes that if copy_huge_pmd() runs
- * and establishes a child pmd before
- * __split_huge_page_splitting() freezes the parent pmd (so if
- * we fail to prevent copy_huge_pmd() from running until the
- * whole __split_huge_page() is complete), we will still see
- * the newly established pmd of the child later during the
- * walk, to be able to set it as pmd_trans_splitting too.
- */
- if (mapcount != page_mapcount(page)) {
- pr_err("mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG();
- }
-
- __split_huge_page_refcount(page, list);
-
- mapcount2 = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount2 += __split_huge_page_map(page, vma, addr);
- }
- if (mapcount != mapcount2) {
- pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG();
- }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
- struct anon_vma *anon_vma;
- int ret = 1;
-
- BUG_ON(is_huge_zero_page(page));
- BUG_ON(!PageAnon(page));
-
- /*
- * The caller does not necessarily hold an mmap_sem that would prevent
- * the anon_vma disappearing so we first we take a reference to it
- * and then lock the anon_vma for write. This is similar to
- * page_lock_anon_vma_read except the write lock is taken to serialise
- * against parallel split or collapse operations.
- */
- anon_vma = page_get_anon_vma(page);
- if (!anon_vma)
- goto out;
- anon_vma_lock_write(anon_vma);
-
- ret = 0;
- if (!PageCompound(page))
- goto out_unlock;
-
- BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma, list);
- count_vm_event(THP_SPLIT);
-
- BUG_ON(PageCompound(page));
-out_unlock:
- anon_vma_unlock_write(anon_vma);
- put_anon_vma(anon_vma);
-out:
- return ret;
+ return false;
}
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2371,7 +2124,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page);
+ page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -2481,6 +2234,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
return NULL;
}
+ prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return *hpage;
}
@@ -2492,8 +2246,12 @@ static int khugepaged_find_target_node(void)
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2543,7 +2301,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
(vma->vm_flags & VM_NOHUGEPAGE))
return false;
-
if (!vma->anon_vma || vma->vm_ops)
return false;
if (is_vma_temporary_stack(vma))
@@ -2583,7 +2340,7 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
@@ -2682,8 +2439,8 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -2703,7 +2460,7 @@ out_nolock:
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -2755,6 +2512,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_NULL;
goto out_unmap;
}
+
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
+ goto out_unmap;
+ }
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -2767,7 +2531,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
khugepaged_node_load[node]++;
- VM_BUG_ON_PAGE(PageCompound(page), page);
if (!PageLRU(page)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
@@ -3040,8 +2803,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -3060,66 +2823,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
put_huge_zero_page();
}
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd)
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long haddr, bool freeze)
{
- spinlock_t *ptl;
- struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ bool young, write, dirty;
+ int i;
- BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
+
+ count_vm_event(THP_SPLIT_PMD);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
-again:
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd)))
- goto unlock;
if (vma_is_dax(vma)) {
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
if (is_huge_zero_pmd(_pmd))
put_huge_zero_page();
+ return;
} else if (is_huge_zero_pmd(*pmd)) {
- __split_huge_zero_page_pmd(vma, haddr, pmd);
- } else {
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- unlock:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (!page)
- return;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ write = pmd_write(*pmd);
+ young = pmd_young(*pmd);
+ dirty = pmd_dirty(*pmd);
- split_huge_page(page);
- put_page(page);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ /*
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
+ */
+ if (freeze) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(entry, vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ }
+ if (dirty)
+ SetPageDirty(page + i);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
+
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
/*
- * We don't always have down_write of mmap_sem here: a racing
- * do_huge_pmd_wp_page() might have copied-on-write to another
- * huge page before our split_huge_page() got the anon_vma lock.
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPU doesn't like that.
+ * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+ * 383 on page 93. Intel should be safe but is also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLB is identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * and pmd_trans_splitting must remain set at all times on the pmd
+ * until the split is complete for this pmd), then we flush the SMP TLB
+ * and finally we write the non-huge version of the pmd entry with
+ * pmd_populate.
*/
- if (unlikely(pmd_trans_huge(*pmd)))
- goto again;
+ pmdp_invalidate(vma, haddr, pmd);
+ pmd_populate(mm, pmd, pgtable);
+
+ if (freeze) {
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ page_remove_rmap(page + i, false);
+ put_page(page + i);
+ }
+ }
}
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd)
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address)
{
- struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
- vma = find_vma(mm, address);
- BUG_ON(vma == NULL);
- split_huge_page_pmd(vma, address, pmd);
+ mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (PageMlocked(page))
+ get_page(page);
+ else
+ page = NULL;
+ } else if (!pmd_devmap(*pmd))
+ goto out;
+ __split_huge_pmd_locked(vma, pmd, haddr, false);
+out:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ if (page) {
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
}
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
@@ -3128,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm,
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
+ pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -3137,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm,
return;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd_mm(mm, address, pmd);
+ split_huge_pmd(vma, pmd, address);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3159,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, start);
+ split_huge_pmd_address(vma, start);
/*
* If the new end address isn't hpage aligned and it could
@@ -3169,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, end);
+ split_huge_pmd_address(vma, end);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3183,6 +3033,540 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_page_address(next->vm_mm, nstart);
+ split_huge_pmd_address(next, nstart);
+ }
+}
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i, nr = HPAGE_PMD_NR;
+
+ /* Skip pages which doesn't belong to the VMA */
+ if (address < vma->vm_start) {
+ int off = (vma->vm_start - address) >> PAGE_SHIFT;
+ page += off;
+ nr -= off;
+ address = vma->vm_start;
+ }
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+ pmd = pmd_offset(pud, address);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return;
+ }
+ if (pmd_trans_huge(*pmd)) {
+ if (page == pmd_page(*pmd))
+ __split_huge_pmd_locked(vma, pmd, haddr, true);
+ spin_unlock(ptl);
+ return;
+ }
+ spin_unlock(ptl);
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+ pte_t entry, swp_pte;
+ swp_entry_t swp_entry;
+
+ /*
+ * We've just crossed page table boundary: need to map next one.
+ * It can happen if THP was mremaped to non PMD-aligned address.
+ */
+ if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+ pte_unmap_unlock(pte - 1, ptl);
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd,
+ address, &ptl);
+ }
+
+ if (!pte_present(*pte))
+ continue;
+ if (page_to_pfn(page) != pte_pfn(*pte))
+ continue;
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = ptep_clear_flush(vma, address, pte);
+ if (pte_dirty(entry))
+ SetPageDirty(page);
+ swp_entry = make_migration_entry(page, pte_write(entry));
+ swp_pte = swp_entry_to_pte(swp_entry);
+ if (pte_soft_dirty(entry))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(vma->vm_mm, address, pte, swp_pte);
+ page_remove_rmap(page, false);
+ put_page(page);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+ pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long address = __vma_address(page, avc->vma);
+
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ freeze_page_vma(avc->vma, page, address);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ }
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte, entry;
+ swp_entry_t swp_entry;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ int i, nr = HPAGE_PMD_NR;
+
+ /* Skip pages which doesn't belong to the VMA */
+ if (address < vma->vm_start) {
+ int off = (vma->vm_start - address) >> PAGE_SHIFT;
+ page += off;
+ nr -= off;
+ address = vma->vm_start;
+ }
+
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+ /*
+ * We've just crossed page table boundary: need to map next one.
+ * It can happen if THP was mremaped to non-PMD aligned address.
+ */
+ if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+ pte_unmap_unlock(pte - 1, ptl);
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd,
+ address, &ptl);
+ }
+
+ if (!is_swap_pte(*pte))
+ continue;
+
+ swp_entry = pte_to_swp_entry(*pte);
+ if (!is_migration_entry(swp_entry))
+ continue;
+ if (migration_entry_to_page(swp_entry) != page)
+ continue;
+
+ get_page(page);
+ page_add_anon_rmap(page, vma, address, false);
+
+ entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ if (PageDirty(page))
+ entry = pte_mkdirty(entry);
+ if (is_write_migration_entry(swp_entry))
+ entry = maybe_mkwrite(entry, vma);
+
+ flush_dcache_page(page);
+ set_pte_at(vma->vm_mm, address, pte, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff = page_to_pgoff(page);
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff, pgoff + HPAGE_PMD_NR - 1) {
+ unsigned long address = __vma_address(page, avc->vma);
+
+ mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ unfreeze_page_vma(avc->vma, page, address);
+ mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+ address, address + HPAGE_PMD_SIZE);
+ }
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ int mapcount;
+ struct page *page_tail = head + tail;
+
+ mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+ /*
+ * tail_page->_count is zero and not changing from under us. But
+ * get_page_unless_zero() may be running from under us on the
+ * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is implemented in C not
+ * using locked ops. spin_unlock on x86 sometime uses locked ops
+ * because of PPro errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and not only on x86),
+ * it's safer to use atomic_add().
+ */
+ atomic_add(mapcount + 1, &page_tail->_count);
+
+
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
+ (1L << PG_dirty)));
+
+ /*
+ * After clearing PageTail the gup refcount can be released.
+ * Page flags also must be visible before we make the page non-compound.
+ */
+ smp_wmb();
+
+ clear_compound_head(page_tail);
+
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+
+ page_tail->index = head->index + tail;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ lru_add_page_tail(head, page_tail, lruvec, list);
+
+ return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct zone *zone = page_zone(head);
+ struct lruvec *lruvec;
+ int i, tail_mapcount;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(head, zone);
+
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(head);
+
+ tail_mapcount = 0;
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+ tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+ atomic_sub(tail_mapcount, &head->_count);
+
+ ClearPageCompound(head);
+ spin_unlock_irq(&zone->lru_lock);
+
+ unfreeze_page(page_anon_vma(head), head);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+ if (subpage == page)
+ continue;
+ unlock_page(subpage);
+
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(subpage);
}
}
+
+int total_mapcount(struct page *page)
+{
+ int i, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ ret = compound_mapcount(page);
+ if (PageHuge(page))
+ return ret;
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ if (PageDoubleMap(page))
+ ret -= HPAGE_PMD_NR;
+ return ret;
+}
+
+/*
+ * This function splits huge page into normal pages. @page can point to any
+ * subpage of huge page to split. Split doesn't change the position of @page.
+ *
+ * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
+ * they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct anon_vma *anon_vma;
+ int count, mapcount, ret;
+ bool mlocked;
+
+ VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma disappearing so we first we take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(head);
+ if (!anon_vma) {
+ ret = -EBUSY;
+ goto out;
+ }
+ anon_vma_lock_write(anon_vma);
+
+ /*
+ * Racy check if we can split the page, before freeze_page() will
+ * split PMDs
+ */
+ if (total_mapcount(head) != page_count(head) - 1) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ mlocked = PageMlocked(page);
+ freeze_page(anon_vma, head);
+ VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+ /* Make sure the page is not on per-CPU pagevec as it takes pin */
+ if (mlocked)
+ lru_add_drain();
+
+ /* Prevent deferred_split_scan() touching ->_count */
+ spin_lock(&split_queue_lock);
+ count = page_count(head);
+ mapcount = total_mapcount(head);
+ if (!mapcount && count == 1) {
+ if (!list_empty(page_deferred_list(head))) {
+ split_queue_len--;
+ list_del(page_deferred_list(head));
+ }
+ spin_unlock(&split_queue_lock);
+ __split_huge_page(page, list);
+ ret = 0;
+ } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+ spin_unlock(&split_queue_lock);
+ pr_alert("total_mapcount: %u, page_count(): %u\n",
+ mapcount, count);
+ if (PageTail(page))
+ dump_page(head, NULL);
+ dump_page(page, "total_mapcount(head) > 0");
+ BUG();
+ } else {
+ spin_unlock(&split_queue_lock);
+ unfreeze_page(anon_vma, head);
+ ret = -EBUSY;
+ }
+
+out_unlock:
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+out:
+ count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ return ret;
+}
+
+void free_transhuge_page(struct page *page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (!list_empty(page_deferred_list(page))) {
+ split_queue_len--;
+ list_del(page_deferred_list(page));
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+ free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+ unsigned long flags;
+
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ if (list_empty(page_deferred_list(page))) {
+ list_add_tail(page_deferred_list(page), &split_queue);
+ split_queue_len++;
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ /*
+ * Split a page from split_queue will free up at least one page,
+ * at most HPAGE_PMD_NR - 1. We don't track exact number.
+ * Let's use HPAGE_PMD_NR / 2 as ballpark.
+ */
+ return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long flags;
+ LIST_HEAD(list), *pos, *next;
+ struct page *page;
+ int split = 0;
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_init(&split_queue, &list);
+
+ /* Take pin on all head pages to avoid freeing them under us */
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ page = compound_head(page);
+ /* race with put_compound_page() */
+ if (!get_page_unless_zero(page)) {
+ list_del_init(page_deferred_list(page));
+ split_queue_len--;
+ }
+ }
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ lock_page(page);
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+ put_page(page);
+ }
+
+ spin_lock_irqsave(&split_queue_lock, flags);
+ list_splice_tail(&list, &split_queue);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
+
+ return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+ .count_objects = deferred_split_count,
+ .scan_objects = deferred_split_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int split_huge_pages_set(void *data, u64 val)
+{
+ struct zone *zone;
+ struct page *page;
+ unsigned long pfn, max_zone_pfn;
+ unsigned long total = 0, split = 0;
+
+ if (val != 1)
+ return -EINVAL;
+
+ for_each_populated_zone(zone) {
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (!get_page_unless_zero(page))
+ continue;
+
+ if (zone != page_zone(page))
+ goto next;
+
+ if (!PageHead(page) || !PageAnon(page) ||
+ PageHuge(page))
+ goto next;
+
+ total++;
+ lock_page(page);
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+next:
+ put_page(page);
+ }
+ }
+
+ pr_info("%lu of %lu THP split", split, total);
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+ "%llu\n");
+
+static int __init split_huge_pages_debugfs(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+ &split_huge_pages_fops);
+ if (!ret)
+ pr_warn("Failed to create split_huge_pages in debugfs");
+ return 0;
+}
+late_initcall(split_huge_pages_debugfs);
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index be934df..12908dc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __SetPageHead(page);
__ClearPageReserved(page);
+ __SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
@@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
- page_dup_rmap(ptepage);
+ page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
}
@@ -3186,7 +3186,7 @@ again:
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
address += sz;
@@ -3415,7 +3415,7 @@ retry_avoidcopy:
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
@@ -3585,7 +3585,7 @@ retry:
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
} else
- page_dup_rmap(page);
+ page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page_foll(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 38e24b8..ed8b5ff 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
/*
* The set of flags that only affect watermark checking and reclaim
@@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-static inline void __get_page_tail_foll(struct page *page,
- bool get_page_head)
-{
- /*
- * If we're getting a tail page, the elevated page->_count is
- * required only in the head page and we will elevate the head
- * page->_count and tail page->_mapcount.
- *
- * We elevate page_tail->_mapcount for tail pages to force
- * page_tail->_count to be zero at all times to avoid getting
- * false positives from get_page_unless_zero() with
- * speculative page access (like in
- * page_cache_get_speculative()) on tail pages.
- */
- VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page);
- if (get_page_head)
- atomic_inc(&compound_head(page)->_count);
- get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
- if (unlikely(PageTail(page)))
- /*
- * This is safe only because
- * __split_huge_page_refcount() can't run under
- * get_page_foll() because we hold the proper PT lock.
- */
- __get_page_tail_foll(page, true);
- else {
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
- }
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
- struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page_to_pgoff(page);
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+ return address;
+}
+
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/ksm.c b/mm/ksm.c
index 2d162c5..ca6d2a0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
up_read(&mm->mmap_sem);
}
-static struct page *page_trans_compound_anon(struct page *page)
-{
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
- /*
- * head may actually be splitted and freed from under
- * us but it's ok here.
- */
- if (PageAnon(head))
- return head;
- }
- return NULL;
-}
-
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page) || page_trans_compound_anon(page)) {
+ if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -956,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr);
+ page_add_anon_rmap(kpage, vma, addr, false);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -975,33 +961,6 @@ out:
return err;
}
-static int page_trans_compound_anon_split(struct page *page)
-{
- int ret = 0;
- struct page *transhuge_head = page_trans_compound_anon(page);
- if (transhuge_head) {
- /* Get the reference on the head to split it. */
- if (get_page_unless_zero(transhuge_head)) {
- /*
- * Recheck we got the reference while the head
- * was still anonymous.
- */
- if (PageAnon(transhuge_head))
- ret = split_huge_page(transhuge_head);
- else
- /*
- * Retry later if split_huge_page run
- * from under us.
- */
- ret = 1;
- put_page(transhuge_head);
- } else
- /* Retry later if split_huge_page run from under us. */
- ret = 1;
- }
- return ret;
-}
-
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -1020,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (page == kpage) /* ksm page forked */
return 0;
- if (PageTransCompound(page) && page_trans_compound_anon_split(page))
- goto out;
- BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1035,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ if (PageTransCompound(page)) {
+ err = split_huge_page(page);
+ if (err)
+ goto out_unlock;
+ }
+
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
@@ -1050,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
+ /*
+ * Page reclaim just frees a clean page with no dirty
+ * ptes: make sure that the ksm page would be swapped.
+ */
+ if (!PageDirty(page))
+ SetPageDirty(page);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
@@ -1065,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
}
}
+out_unlock:
unlock_page(page);
out:
return err;
@@ -1635,8 +1605,7 @@ next_mm:
cond_resched();
continue;
}
- if (PageAnon(*page) ||
- page_trans_compound_anon(*page)) {
+ if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1899,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcb..f56825b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,9 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ spinlock_t *ptl;
+ pte_t *orig_pte, *pte, ptent;
+ struct page *page;
+ int nr_swap = 0;
+ unsigned long next;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd))
+ if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+ goto next;
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte has swp_entry, just clear page table to
+ * prevent swap-in which is more expensive rather than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * If pmd isn't transhuge but the page is THP and
+ * is owned by only this process, split it and
+ * deactivate all pages.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ goto out;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ goto out;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ goto out;
+ }
+ put_page(page);
+ unlock_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (PageSwapCache(page) || PageDirty(page)) {
+ if (!trylock_page(page))
+ continue;
+ /*
+ * If page is shared with others, we couldn't clear
+ * PG_dirty of the page.
+ */
+ if (page_mapcount(page) != 1) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (PageSwapCache(page) && !try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ if (pte_young(ptent) || pte_dirty(ptent)) {
+ /*
+ * Some of architecture(ex, PPC) don't update TLB
+ * with set_pte_at and tlb_remove_tlb_entry so for
+ * the portability, remap the pte with old|clean
+ * after pte clearing.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+ }
+out:
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+next:
+ return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (!vma_is_anonymous(vma))
+ return -EINVAL;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+ * XXX: In this implementation, MADV_FREE works like
+ * MADV_DONTNEED on swapless system or full swap.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* passthrough */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 54eae4f..0eda673 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -382,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- rcu_read_lock();
-
memcg = page->mem_cgroup;
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
- rcu_read_unlock();
return &memcg->css;
}
@@ -647,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- int nr_pages)
+ bool compound, int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -660,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
+ }
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
@@ -2431,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -3494,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
- /* If all events are unregistered, free the spare array */
- if (!new) {
- kfree(thresholds->spare);
- thresholds->spare = NULL;
- }
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
+
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
unlock:
mutex_unlock(&memcg->thresholds_lock);
}
@@ -4505,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
+ bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
unsigned long flags;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
+ VM_BUG_ON(compound && !PageTransHuge(page));
/*
* Prevent mem_cgroup_replace_page() from looking at
* page->mem_cgroup of its source page while we change it.
*/
+ ret = -EBUSY;
if (!trylock_page(page))
goto out;
@@ -4591,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
+ mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -4683,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4871,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
union mc_target target;
struct page *page;
- /*
- * We don't take compound_lock() here but no race with splitting thp
- * happens because:
- * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
- * under splitting, which means there's no concurrent thp split,
- * - if another thread runs into split_huge_page() just after we
- * entered this if-block, the thread must wait for page table lock
- * to be unlocked in __split_huge_page_splitting(), where the main
- * part of thp split is not executed yet.
- */
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
@@ -4890,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
@@ -4917,9 +4897,18 @@ retry:
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_PAGE:
page = target.page;
+ /*
+ * We can have a part of the split pmd here. Moving it
+ * can be done but it would be too convoluted so simply
+ * ignore such a partial THP and keep it in original
+ * memcg. There should be somebody mapping the head.
+ */
+ if (PageTransCompound(page))
+ goto put;
if (isolate_lru_page(page))
goto put;
- if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -5258,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
* with mem_cgroup_cancel_charge() in case page instantiation fails.
*/
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
{
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
@@ -5291,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
}
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5324,9 +5309,9 @@ out:
* Use mem_cgroup_cancel_charge() to cancel the transaction instead.
*/
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_PAGE(!page->mapping, page);
VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5343,13 +5328,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
commit_charge(page, memcg, lrucare);
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
@@ -5371,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
if (mem_cgroup_disabled())
return;
@@ -5385,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
if (!memcg)
return;
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
cancel_charge(memcg, nr_pages);
}
@@ -5750,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for udpating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -1);
+ mem_cgroup_charge_statistics(memcg, page, false, -1);
memcg_check_events(memcg, page);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8424b64..ac595e7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
- if (PageHuge(head))
- return get_page_unless_zero(head);
-
- /*
- * Thp tail page has special refcounting rule (refcount of tail pages
- * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
- * directly for tail pages.
- */
- if (PageTransHuge(head)) {
+ if (!PageHuge(head) && PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page)
page_to_pfn(page));
return 0;
}
-
- if (get_page_unless_zero(head)) {
- if (PageTail(page))
- get_page(page);
- return 1;
- } else {
- return 0;
- }
}
- return get_page_unless_zero(page);
+ return get_page_unless_zero(head);
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
-/**
- * put_hwpoison_page() - Put refcount for memory error handling:
- * @page: raw error page (hit by memory error)
- */
-void put_hwpoison_page(struct page *page)
-{
- struct page *head = compound_head(page);
-
- if (PageHuge(head)) {
- put_page(head);
- return;
- }
-
- if (PageTransHuge(head))
- if (page != head)
- put_page(head);
-
- put_page(page);
-}
-EXPORT_SYMBOL_GPL(put_hwpoison_page);
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
+ lock_page(hpage);
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
if (!PageAnon(hpage))
pr_err("MCE: %#lx: non anonymous thp\n", pfn);
else
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
put_hwpoison_page(p);
return -EBUSY;
}
+ unlock_page(hpage);
+ get_hwpoison_page(p);
+ put_hwpoison_page(hpage);
VM_BUG_ON_PAGE(!page_count(p), p);
hpage = compound_head(p);
}
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
* Did it turn free?
*/
ret = __get_any_page(page, pfn, 0);
- if (!PageLRU(page)) {
+ if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags)
return ret;
}
+static int soft_offline_in_use_page(struct page *page, int flags)
+{
+ int ret;
+ struct page *hpage = compound_head(page);
+
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ lock_page(hpage);
+ if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
+ if (!PageAnon(hpage))
+ pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
+ else
+ pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
+ put_hwpoison_page(hpage);
+ return -EBUSY;
+ }
+ unlock_page(hpage);
+ get_hwpoison_page(page);
+ put_hwpoison_page(hpage);
+ }
+
+ if (PageHuge(page))
+ ret = soft_offline_huge_page(page, flags);
+ else
+ ret = __soft_offline_page(page, flags);
+
+ return ret;
+}
+
+static void soft_offline_free_page(struct page *page)
+{
+ if (PageHuge(page)) {
+ struct page *hpage = compound_head(page);
+
+ set_page_hwpoison_huge_page(hpage);
+ if (!dequeue_hwpoisoned_huge_page(hpage))
+ num_poisoned_pages_add(1 << compound_order(hpage));
+ } else {
+ if (!TestSetPageHWPoison(page))
+ num_poisoned_pages_inc();
+ }
+}
+
/**
* soft_offline_page - Soft offline a page.
* @page: page to offline
@@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page);
return -EBUSY;
}
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
- pr_info("soft offline: %#lx: failed to split THP\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
- }
- }
get_online_mems();
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
- if (ret > 0) { /* for in-use pages */
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- } else if (ret == 0) { /* for free pages */
- if (PageHuge(page)) {
- set_page_hwpoison_huge_page(hpage);
- if (!dequeue_hwpoisoned_huge_page(hpage))
- num_poisoned_pages_add(1 << compound_order(hpage));
- } else {
- if (!TestSetPageHWPoison(page))
- num_poisoned_pages_inc();
- }
- }
+
+ if (ret > 0)
+ ret = soft_offline_in_use_page(page, flags);
+ else if (ret == 0)
+ soft_offline_free_page(page);
+
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index d4e4d37..ff17850 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
+#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
- int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
- wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
atomic_long_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
new = NULL;
- } else if (unlikely(pmd_trans_splitting(*pmd)))
- wait_split_huge_page = 1;
+ }
spin_unlock(ptl);
if (new)
pte_free(mm, new);
- if (wait_split_huge_page)
- wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- } else
- VM_BUG_ON(pmd_trans_splitting(*pmd));
+ }
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -870,7 +865,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page);
+ page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
@@ -955,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*src_pmd)) {
+ if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
err = copy_huge_pmd(dst_mm, src_mm,
@@ -1118,7 +1113,7 @@ again:
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1182,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
@@ -1193,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1506,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1522,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
goto out_unlock;
/* Ok, finally just insert the thing.. */
- entry = pte_mkspecial(pfn_pte(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
+ else
+ entry = pte_mkspecial(pfn_t_pte(pfn, prot));
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1569,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_insert(vma, &pgprot, pfn))
+ if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
return -EINVAL;
- ret = insert_pfn(vma, addr, pfn, pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
+ pfn_t pfn)
{
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
@@ -1593,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+ if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
struct page *page;
- page = pfn_to_page(pfn);
+ page = pfn_t_to_page(pfn);
return insert_page(vma, addr, page, vma->vm_page_prot);
}
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2087,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
cow_user_page(new_page, old_page, address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -2118,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
@@ -2151,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
}
if (new_page)
@@ -2173,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
- munlock_vma_page(old_page);
+ if (PageMlocked(old_page))
+ munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
@@ -2533,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_page;
}
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -2567,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
- exclusive = 1;
+ exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
@@ -2575,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, address, page_table, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2613,7 +2612,7 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
@@ -2707,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
@@ -2728,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2747,7 +2746,7 @@ unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
goto unlock;
oom_free_page:
@@ -2824,7 +2823,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, false);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page);
@@ -3000,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
@@ -3029,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
@@ -3044,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
page_cache_release(new_page);
return ret;
}
@@ -3096,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
@@ -3198,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
+ /* TODO: handle PTE-mapped THP */
+ if (PageCompound(page)) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
@@ -3370,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int ret;
barrier();
- if (pmd_trans_huge(orig_pmd)) {
+ if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- /*
- * If the pmd is splitting, return and retry the
- * the fault. Alternative: wait until the split
- * is done, and goto retry.
- */
- if (pmd_trans_splitting(orig_pmd))
- return 0;
-
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
@@ -3407,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
+ if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 92f9595..4af58a3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -17,6 +17,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
+#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
@@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
unsigned long i;
int err = 0;
int start_sec, end_sec;
+ struct vmem_altmap *altmap;
+
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+ altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
+ if (altmap) {
+ /*
+ * Validate altmap is within bounds of the total request
+ */
+ if (altmap->base_pfn != phys_start_pfn
+ || vmem_altmap_offset(altmap) > nr_pages) {
+ pr_warn_once("memory add fail, invalid altmap\n");
+ return -EINVAL;
+ }
+ altmap->alloc = 0;
+ }
+
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(nid, zone, section_nr_to_pfn(i));
@@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+static int __remove_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset)
{
unsigned long start_pfn;
int scn_nr;
@@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
start_pfn = section_nr_to_pfn(scn_nr);
__remove_zone(zone, start_pfn);
- sparse_remove_one_section(zone, ms);
+ sparse_remove_one_section(zone, ms, map_offset);
return 0;
}
@@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long nr_pages)
{
unsigned long i;
- int sections_to_remove;
- resource_size_t start, size;
- int ret = 0;
+ unsigned long map_offset = 0;
+ int sections_to_remove, ret = 0;
+
+ /* In the ZONE_DEVICE case device driver owns the memory region */
+ if (is_dev_zone(zone)) {
+ struct page *page = pfn_to_page(phys_start_pfn);
+ struct vmem_altmap *altmap;
+
+ altmap = to_vmem_altmap((unsigned long) page);
+ if (altmap)
+ map_offset = vmem_altmap_offset(altmap);
+ } else {
+ resource_size_t start, size;
+
+ start = phys_start_pfn << PAGE_SHIFT;
+ size = nr_pages * PAGE_SIZE;
+
+ ret = release_mem_region_adjustable(&iomem_resource, start,
+ size);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
+ }
/*
* We can only remove entire sections
@@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
-
- /* in the ZONE_DEVICE case device driver owns the memory region */
- if (!is_dev_zone(zone))
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
- ret = __remove_section(zone, __pfn_to_section(pfn));
+
+ ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
+ map_offset = 0;
if (ret)
break;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d8caff0..27d1354 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
+ int nid, ret;
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_trans_unstable(pmd))
- return 0;
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ } else {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return 0;
+ }
+ } else {
+ spin_unlock(ptl);
+ }
+ }
+retry:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
nid = page_to_nid(page);
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
+ if (PageTail(page) && PageAnon(page)) {
+ get_page(page);
+ pte_unmap_unlock(pte, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ /* Failed to split -- skip. */
+ if (ret) {
+ pte = pte_offset_map_lock(walk->mm, pmd,
+ addr, &ptl);
+ continue;
+ }
+ goto retry;
+ }
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, qp->pagelist, flags);
@@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (vma_migratable(vma) &&
+ vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
change_prot_numa(vma, start, endvma);
return 1;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 7890d0b..b1034f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
- page_dup_rmap(new);
+ page_dup_rmap(new, true);
} else if (PageAnon(new))
- page_add_anon_rmap(new, vma, addr);
+ page_add_anon_rmap(new, vma, addr, false);
else
page_add_file_rmap(new);
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
+ if (unlikely(PageTransHuge(page))) {
+ lock_page(page);
+ rc = split_huge_page(page);
+ unlock_page(page);
+ if (rc)
goto out;
+ }
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
+ prep_transhuge_page(new_page);
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1820,7 @@ fail_putback:
* guarantee the copy is visible before the pagetable update.
*/
flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start);
+ page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1831,14 @@ fail_putback:
flush_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page);
+ page_remove_rmap(new_page, true);
goto fail_putback;
}
mlock_migrate_page(new_page, page);
set_page_memcg(new_page, page_memcg(page));
set_page_memcg(page, NULL);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 14bb9fb..2a565ed 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
memset(vec, 1, nr);
spin_unlock(ptl);
goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index 9cb87cb..e1e2b12 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -24,13 +24,13 @@
#include "internal.h"
-int can_do_mlock(void)
+bool can_do_mlock(void)
{
if (rlimit(RLIMIT_MEMLOCK) != 0)
- return 1;
+ return true;
if (capable(CAP_IPC_LOCK))
- return 1;
- return 0;
+ return true;
+ return false;
}
EXPORT_SYMBOL(can_do_mlock);
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
/* Serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
if (!TestSetPageMlocked(page)) {
mod_zone_page_state(page_zone(page), NR_MLOCK,
hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
/*
* Serialize with any parallel __split_huge_page_refcount() which
* might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
if (!page || page_zone_id(page) != zoneid)
break;
+ /*
+ * Do not use pagevec for PTE-mapped THP,
+ * munlock_vma_pages_range() will handle them.
+ */
+ if (PageTransCompound(page))
+ break;
+
get_page(page);
/*
* Increase the address that will be returned *before* the
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
&page_mask);
if (page && !IS_ERR(page)) {
- if (PageTransHuge(page)) {
+ if (PageTransTail(page)) {
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ put_page(page); /* follow_page_mask() */
+ } else if (PageTransHuge(page)) {
lock_page(page);
/*
* Any THP page found by follow_page_mask() may
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
goto next;
}
}
- /* It's a bug to munlock in the middle of a THP page */
- VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
diff --git a/mm/mmap.c b/mm/mmap.c
index b3f00b6..84b1262 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
- * We can take all the locks in random order because the VM code
- * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
- * takes more than one of them in a row. Secondly we're protected
- * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ * We take locks in following order, accordingly to comment at beginning
+ * of mm/rmap.c:
+ * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ * hugetlb mapping);
+ * - all i_mmap_rwsem locks;
+ * - all anon_vma->rwseml
+ *
+ * We can take all locks within these types randomly because the VM code
+ * doesn't nest them and we protected from parallel mm_take_all_locks() by
+ * mm_all_locks_mutex.
*
* mm_take_all_locks() and mm_drop_all_locks are expensive operations
* that may have to take thousand of locks.
@@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm)
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
- if (vma->vm_file && vma->vm_file->f_mapping)
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ is_vm_hugetlb_page(vma))
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ !is_vm_hugetlb_page(vma))
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c764402..8eb7bb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
+ if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
+ && pmd_none_or_clear_bad(pmd))
continue;
/* invoke the mmu notifier if the pmd is populated */
@@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mni_start, end);
}
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index e55b157..d77946a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (!new_pmd)
break;
if (pmd_trans_huge(*old_pmd)) {
- int err = 0;
if (extent == HPAGE_PMD_SIZE) {
+ bool moved;
VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
- err = move_huge_pmd(vma, new_vma, old_addr,
+ moved = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
if (need_rmap_locks)
anon_vma_unlock_write(vma->anon_vma);
+ if (moved) {
+ need_flush = true;
+ continue;
+ }
}
- if (err > 0) {
- need_flush = true;
- continue;
- } else if (!err) {
- split_huge_page_pmd(vma, old_addr, old_pmd);
- }
+ split_huge_pmd(vma, old_pmd, old_addr);
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce63d60..63358d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
+#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
@@ -222,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
-static void free_compound_page(struct page *page);
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
free_huge_page,
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ free_transhuge_page,
+#endif
};
int min_free_kbytes = 1024;
@@ -450,7 +453,7 @@ out:
* This usage means that zero-order pages may not be compound.
*/
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
{
__free_pages_ok(page, compound_order(page));
}
@@ -466,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -732,7 +737,7 @@ static inline int free_pages_check(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -856,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
ret = 0;
goto out;
}
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set", 0);
goto out;
@@ -866,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
}
ret = 0;
out:
+ page->mapping = NULL;
clear_compound_head(page);
return ret;
}
@@ -1329,7 +1356,7 @@ static inline int check_new_page(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -4459,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
- pg_data_t *pgdat = NODE_DATA(nid);
+ struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
unsigned long end_pfn = start_pfn + size;
+ pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
- struct zone *z;
unsigned long nr_initialised = 0;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
- z = &pgdat->node_zones[zone];
+ /*
+ * Honor reservation requested by the driver for this ZONE_DEVICE
+ * memory
+ */
+ if (altmap && start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
diff --git a/mm/page_idle.c b/mm/page_idle.c
index d5dd790..4ea9c4e 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page,
unsigned long addr, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd;
pte_t *pte;
+ spinlock_t *ptl;
bool referenced = false;
- if (unlikely(PageTransHuge(page))) {
- pmd = page_check_address_pmd(page, mm, addr,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (pmd) {
- referenced = pmdp_clear_young_notify(vma, addr, pmd);
- spin_unlock(ptl);
- }
+ if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
+ return SWAP_AGAIN;
+
+ if (pte) {
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap(pte);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
} else {
- pte = page_check_address(page, mm, addr, &ptl, 0);
- if (pte) {
- referenced = ptep_clear_young_notify(vma, addr, pte);
- pte_unmap_unlock(pte, ptl);
- }
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
}
+
+ spin_unlock(ptl);
+
if (referenced) {
clear_page_idle(page);
/*
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e139fe..92c4c36 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -196,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
{
unsigned long pfn;
struct page *page;
- BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
- BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+ BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+ BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+
for (pfn = start_pfn;
pfn < end_pfn;
pfn += pageblock_nr_pages) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 29f2f8b..2072444 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
- split_huge_page_pmd_mm(walk->mm, addr, pmd);
+ split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4c681ba..9d47676 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_trans_huge(*pmdp));
+ VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- /* tlb flush only to serialize against gup-fast */
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
diff --git a/mm/rmap.c b/mm/rmap.c
index 622756c..79f3bf0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,21 +23,22 @@
* inode->i_mutex (while writing or truncating, not reading or faulting)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_rwsem
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- * mapping->tree_lock (widely used)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * mapping->tree_lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * mapping->i_mmap_rwsem
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ * mapping->tree_lock (widely used)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * mapping->tree_lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
@@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
anon_vma_unlock_read(anon_vma);
}
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
- unsigned long address = __vma_address(page, vma);
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
- return address;
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
static void percpu_flush_tlb_batch_pages(void *data)
{
@@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that @page is mapped at @address into @mm. In contrast to
+ * page_check_address(), this function can handle transparent huge pages.
+ *
+ * On success returns true with pte mapped and locked. For PMD-mapped
+ * transparent huge pages *@ptep is set to NULL.
+ */
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+ unsigned long address, pmd_t **pmdp,
+ pte_t **ptep, spinlock_t **ptlp)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return false;
+
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ pmd = NULL;
+ goto check_pte;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return false;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return false;
+ pmd = pmd_offset(pud, address);
+
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ goto map_pte;
+ }
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ pte = NULL;
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return false;
+ } else {
+ pmd_t pmde = *pmd;
+
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ return false;
+ }
+map_pte:
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return false;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+check_pte:
+ spin_lock(ptl);
+
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+found:
+ *ptep = pte;
+ *pmdp = pmd;
+ *ptlp = ptl;
+ return true;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
struct page_referenced_arg {
int mapcount;
int referenced;
@@ -832,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
+ struct page_referenced_arg *pra = arg;
+ pmd_t *pmd;
+ pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
- struct page_referenced_arg *pra = arg;
-
- if (unlikely(PageTransHuge(page))) {
- pmd_t *pmd;
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address_pmd().
- */
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
- return SWAP_AGAIN;
-
- if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
+ return SWAP_AGAIN;
- /* go ahead even if the pmd is pmd_trans_splitting() */
- if (pmdp_clear_flush_young_notify(vma, address, pmd))
- referenced++;
+ if (vma->vm_flags & VM_LOCKED) {
+ if (pte)
+ pte_unmap(pte);
spin_unlock(ptl);
- } else {
- pte_t *pte;
-
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address().
- */
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- return SWAP_AGAIN;
-
- if (vma->vm_flags & VM_LOCKED) {
- pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
+ if (pte) {
if (ptep_clear_flush_young_notify(vma, address, pte)) {
/*
* Don't treat a reference through a sequentially read
@@ -886,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
- pte_unmap_unlock(pte, ptl);
+ pte_unmap(pte);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
+ } else {
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
}
+ spin_unlock(ptl);
if (referenced)
clear_page_idle(page);
@@ -935,7 +987,7 @@ int page_referenced(struct page *page,
int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
- .mapcount = page_mapcount(page),
+ .mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
@@ -1124,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
* over the call to page_add_new_anon_rmap.
*/
BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page->index != linear_page_index(vma, address));
+ BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}
@@ -1133,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
@@ -1140,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
- do_page_add_anon_rmap(page, vma, address, 0);
+ do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}
/*
@@ -1151,29 +1204,44 @@ void page_add_anon_rmap(struct page *page,
* Everybody else should continue to use page_add_anon_rmap above.
*/
void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
+ struct vm_area_struct *vma, unsigned long address, int flags)
{
- int first = atomic_inc_and_test(&page->_mapcount);
+ bool compound = flags & RMAP_COMPOUND;
+ bool first;
+
+ if (compound) {
+ atomic_t *mapcount;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ mapcount = compound_mapcount_ptr(page);
+ first = atomic_inc_and_test(mapcount);
+ } else {
+ first = atomic_inc_and_test(&page->_mapcount);
+ }
+
if (first) {
+ int nr = compound ? hpage_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
- if (PageTransHuge(page))
+ if (compound) {
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
}
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON_PAGE(!PageLocked(page), page);
+
/* address might be in next vma when migration races vma_adjust */
if (first)
- __page_set_anon_rmap(page, vma, address, exclusive);
+ __page_set_anon_rmap(page, vma, address,
+ flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -1183,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
+ int nr = compound ? hpage_nr_pages(page) : 1;
+
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
- atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(compound_mapcount_ptr(page), 0);
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ } else {
+ /* Anon THP always mapped first with PMD */
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1225,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
- /* page still mapped by someone else? */
- if (!atomic_add_negative(-1, &page->_mapcount))
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(compound_mapcount_ptr(page));
goto out;
+ }
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
- if (unlikely(PageHuge(page)))
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
/*
@@ -1247,41 +1328,79 @@ out:
mem_cgroup_end_page_stat(memcg);
}
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+ int i, nr;
+
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ if (TestClearPageDoubleMap(page)) {
+ /*
+ * Subpages can be mapped with PTEs too. Check how many of
+ * themi are still mapped.
+ */
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ } else {
+ nr = HPAGE_PMD_NR;
+ }
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+
+ if (nr) {
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+ deferred_split_huge_page(page);
+ }
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page: page to remove mapping from
+ * @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
{
if (!PageAnon(page)) {
+ VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
page_remove_file_rmap(page);
return;
}
+ if (compound)
+ return page_remove_anon_compound_rmap(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (PageTransHuge(page))
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
+ __dec_zone_page_state(page, NR_ANON_PAGES);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+ if (PageTransCompound(page))
+ deferred_split_huge_page(compound_head(page));
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1293,6 +1412,11 @@ void page_remove_rmap(struct page *page)
*/
}
+struct rmap_private {
+ enum ttu_flags flags;
+ int lazyfreed;
+};
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1304,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
- enum ttu_flags flags = (enum ttu_flags)arg;
+ struct rmap_private *rp = arg;
+ enum ttu_flags flags = rp->flags;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1396,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* See handle_pte_fault() ...
*/
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+
+ if (!PageDirty(page) && (flags & TTU_LZFREE)) {
+ /* It's a freeable page by MADV_FREE */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ rp->lazyfreed++;
+ goto discard;
+ }
+
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pte, pteval);
ret = SWAP_FAIL;
@@ -1416,7 +1549,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, mm_counter_file(page));
- page_remove_rmap(page);
+discard:
+ page_remove_rmap(page, PageHuge(page));
page_cache_release(page);
out_unmap:
@@ -1468,9 +1602,14 @@ static int page_not_mapped(struct page *page)
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
+ struct rmap_private rp = {
+ .flags = flags,
+ .lazyfreed = 0,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = (void *)flags,
+ .arg = &rp,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1490,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapped(page))
+ if (ret != SWAP_MLOCK && !page_mapped(page)) {
ret = SWAP_SUCCESS;
+ if (rp.lazyfreed && !PageDirty(page))
+ ret = SWAP_LZFREE;
+ }
return ret;
}
@@ -1513,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
int try_to_munlock(struct page *page)
{
int ret;
+ struct rmap_private rp = {
+ .flags = TTU_MUNLOCK,
+ .lazyfreed = 0,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = (void *)TTU_MUNLOCK,
+ .arg = &rp,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
@@ -1698,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(&page->_mapcount);
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
}
@@ -1707,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(&page->_mapcount, 0);
+ atomic_set(compound_mapcount_ptr(page), 0);
__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 970ff5b..b98e101 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+ false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
if (error) {
if (error != -ENOMEM)
error = 0;
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
} else
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
page_cache_release(page);
@@ -1085,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1218,7 +1219,8 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
swp_to_radix_entry(swap));
@@ -1235,14 +1237,14 @@ repeat:
* "repeat": reading a hole and writing should succeed.
*/
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
delete_from_swap_cache(page);
}
}
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
spin_lock(&info->lock);
info->swapped--;
@@ -1277,11 +1279,12 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1291,10 +1294,10 @@ repeat:
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
goto decused;
}
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
diff --git a/mm/slub.c b/mm/slub.c
index 2d0e610..b21fd24 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 4cba9c2..b60802b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
}
/* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+static void * __meminit alloc_block_buf(unsigned long size, int node)
{
void *ptr;
@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
return ptr;
}
+static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
+{
+ return altmap->base_pfn + altmap->reserve + altmap->alloc
+ + altmap->align;
+}
+
+static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
+{
+ unsigned long allocated = altmap->alloc + altmap->align;
+
+ if (altmap->free > allocated)
+ return altmap->free - allocated;
+ return 0;
+}
+
+/**
+ * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
+ * @altmap - reserved page pool for the allocation
+ * @nr_pfns - size (in pages) of the allocation
+ *
+ * Allocations are aligned to the size of the request
+ */
+static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+ unsigned long pfn = vmem_altmap_next_pfn(altmap);
+ unsigned long nr_align;
+
+ nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
+ nr_align = ALIGN(pfn, nr_align) - pfn;
+
+ if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
+ return ULONG_MAX;
+ altmap->alloc += nr_pfns;
+ altmap->align += nr_align;
+ return pfn + nr_align;
+}
+
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long pfn, nr_pfns;
+ void *ptr;
+
+ if (size & ~PAGE_MASK) {
+ pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
+ __func__, size);
+ return NULL;
+ }
+
+ nr_pfns = size >> PAGE_SHIFT;
+ pfn = vmem_altmap_alloc(altmap, nr_pfns);
+ if (pfn < ULONG_MAX)
+ ptr = __va(__pfn_to_phys(pfn));
+ else
+ ptr = NULL;
+ pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
+ __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
+
+ return ptr;
+}
+
+/* need to make sure size is all the same during early stage */
+void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap)
+{
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
+ return alloc_block_buf(size, node);
+}
+
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+ void *p = alloc_block_buf(PAGE_SIZE, node);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
diff --git a/mm/sparse.c b/mm/sparse.c
index d1b48b6..3717cee 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
if (!memmap)
return;
- for (i = 0; i < PAGES_PER_SECTION; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
atomic_long_sub(1, &num_poisoned_pages);
ClearPageHWPoison(&memmap[i]);
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
free_map_bootmem(memmap);
}
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset)
{
struct page *memmap = NULL;
unsigned long *usemap = NULL, flags;
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
}
pgdat_resize_unlock(pgdat, &flags);
- clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
+ clear_hwpoisoned_pages(memmap + map_offset,
+ PAGES_PER_SECTION - map_offset);
free_section_usemap(memmap, usemap);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb..09fe5e9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
+#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pinsmust be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
- /*
- * If @page is a THP tail, we must read the tail page
- * flags after the head page flags. The
- * __split_huge_page_refcount side enforces write memory barriers
- * between clearing PageTail and before the head page
- * can be freed and reallocated.
- */
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount cannot race
- * here, see the comment above this function.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- if (put_page_testzero(page_head)) {
- /*
- * If this is the tail of a slab THP page,
- * the tail pin must not be the last reference
- * held on the page, because the PG_slab cannot
- * be cleared before all tail pins (which skips
- * the _mapcount tail refcounting) have been
- * released.
- *
- * If this is the tail of a hugetlbfs page,
- * the tail pin may be the last reference on
- * the page instead, because PageHeadHuge will
- * not go away until the compound page enters
- * the buddy allocator.
- */
- VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
- __put_compound_page(page_head);
- }
- } else
- /*
- * __split_huge_page_refcount run before us,
- * @page was a THP tail. The split @page_head
- * has been freed and reallocated as slab or
- * hugetlbfs page of smaller order (only
- * possible if reallocated as slab on x86).
- */
- if (put_page_testzero(page))
- __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- unsigned long flags;
-
- /*
- * @page_head wasn't a dangling pointer but it may not
- * be a head page anymore by the time we obtain the
- * lock. That is ok as long as it can't be freed from
- * under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
- if (put_page_testzero(page_head)) {
- /*
- * The @page_head may have been freed
- * and reallocated as a compound page
- * of smaller order and then freed
- * again. All we know is that it
- * cannot have become: a THP page, a
- * compound page of higher order, a
- * tail page. That is because we
- * still hold the refcount of the
- * split THP tail and page_head was
- * the THP head before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON_PAGE(page_head != compound_head(page), page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on the
- * compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON_PAGE(1, page_head);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- compound_unlock_irqrestore(page_head, flags);
-
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
- } else {
- /* @page_head is a dangling pointer */
- VM_BUG_ON_PAGE(PageTail(page), page);
- goto out_put_single;
- }
-}
-
-static void put_compound_page(struct page *page)
-{
- struct page *page_head;
-
- /*
- * We see the PageCompound set and PageTail not set, so @page maybe:
- * 1. hugetlbfs head page, or
- * 2. THP head page.
- */
- if (likely(!PageTail(page))) {
- if (put_page_testzero(page)) {
- /*
- * By the time all refcounts have been released
- * split_huge_page cannot run anymore from under us.
- */
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
- }
- return;
- }
-
- /*
- * We see the PageCompound set and PageTail set, so @page maybe:
- * 1. a tail hugetlbfs page, or
- * 2. a tail THP page, or
- * 3. a split THP page.
- *
- * Case 3 is possible, as we may race with
- * __split_huge_page_refcount tearing down a THP page.
- */
- page_head = compound_head(page);
- if (!__compound_tail_refcounted(page_head))
- put_unrefcounted_compound_page(page_head, page);
- else
- put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
- put_compound_page(page);
- else if (put_page_testzero(page))
+ __put_compound_page(page);
+ else
__put_single_page(page);
}
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
- /*
- * This takes care of get_page() if run on a tail page
- * returned by one of the get_user_pages/follow_page variants.
- * get_user_pages/follow_page itself doesn't need the compound
- * lock because it runs __get_page_tail_foll() under the
- * proper PT lock that already serializes against
- * split_huge_page().
- */
- unsigned long flags;
- bool got;
- struct page *page_head = compound_head(page);
-
- /* Ref to put_compound_page() comment. */
- if (!__compound_tail_refcounted(page_head)) {
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- __get_page_tail_foll(page, true);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- return false;
- }
- }
-
- got = false;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
- }
- return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
/**
* put_pages_list() - release a list of pages
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
*/
void mark_page_accessed(struct page *page)
{
+ page = compound_head(page);
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
activate_page_drain(cpu);
}
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
}
}
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
void lru_add_drain(void)
{
lru_add_drain_cpu(get_cpu());
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
- if (unlikely(PageCompound(page))) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
- }
- put_compound_page(page);
- continue;
- }
-
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
zone = NULL;
}
+ page = compound_head(page);
if (!put_page_testzero(page))
continue;
+ if (PageCompound(page)) {
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ __put_compound_page(page);
+ continue;
+ }
+
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d504adb..676ff29 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list)
* deadlock in the swap out path.
*/
/*
- * Add it to the swap cache and mark it dirty
+ * Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
- if (!err) { /* Success */
- SetPageDirty(page);
+ if (!err) {
return 1;
} else { /* -ENOMEM radix-tree allocation failure */
/*
@@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e6b8591..2bb30aa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -926,6 +926,9 @@ int reuse_swap_page(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
+ /* The page is part of THP and cannot be reused */
+ if (PageTransCompound(page))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@@ -1108,19 +1111,9 @@ unsigned int count_swap_pages(int type, int free)
}
#endif /* CONFIG_HIBERNATION */
-static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
+static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
- /*
- * When pte keeps soft dirty bit the pte generated
- * from swap entry does not has it, still it's same
- * pte from logical point of view.
- */
- pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
- return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
-#else
- return pte_same(pte, swp_pte);
-#endif
+ return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
}
/*
@@ -1142,14 +1135,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = -ENOMEM;
goto out_nolock;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg);
+ if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+ mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1160,11 +1154,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, true);
+ page_add_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
@@ -1206,7 +1200,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
- if (unlikely(maybe_same_pte(*pte, swp_pte))) {
+ if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
pte_unmap(pte);
ret = unuse_pte(vma, pmd, addr, entry, page);
if (ret)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee93..806b0c7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_uncharge_unlock;
inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
out_release:
page_cache_release(page);
goto out;
diff --git a/mm/util.c b/mm/util.c
index 2d28f79..6d1f920 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -399,11 +401,25 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
+}
+
+/* Slow path of page_mapcount() for compound pages */
+int __page_mapcount(struct page *page)
+{
+ int ret;
+
+ ret = atomic_read(&page->_mapcount) + 1;
+ page = compound_head(page);
+ ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+ if (PageDoubleMap(page))
+ ret--;
+ return ret;
}
+EXPORT_SYMBOL_GPL(__page_mapcount);
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 58ceeb1..fb42a5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -455,7 +455,7 @@ found:
free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
- BUG_ON(va->va_start & (align-1));
+ BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
@@ -1086,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(addr & (PAGE_SIZE-1));
+ BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 108bd11..5ac8695 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool lazyfree = false;
+ int ret = SWAP_SUCCESS;
cond_resched();
@@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
+ lazyfree = true;
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page,
- ttu_flags|TTU_BATCH_FLUSH)) {
+ switch (ret = try_to_unmap(page, lazyfree ?
+ (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+ (ttu_flags | TTU_BATCH_FLUSH))) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_MLOCK:
goto cull_mlocked;
+ case SWAP_LZFREE:
+ goto lazyfree;
case SWAP_SUCCESS:
; /* try to free the page below */
}
@@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
+lazyfree:
if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
@@ -1184,8 +1191,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
+ if (ret == SWAP_LZFREE)
+ count_vm_event(PGLAZYFREED);
+
nr_reclaimed++;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83a003b..64bd0aa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -783,6 +783,7 @@ const char * const vmstat_text[] = {
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -844,7 +845,9 @@ const char * const vmstat_text[] = {
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
- "thp_split",
+ "thp_split_page",
+ "thp_split_page_failed",
+ "thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
OpenPOWER on IntegriCloud