diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-17 12:58:52 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-17 12:58:52 -0800 |
commit | 0cbeafb245ca568bc0765645aa64f0451b716657 (patch) | |
tree | 663c09ff5a62a1b2b66a17c4dfe0413603530a36 /mm | |
parent | 58cf279acac3080ce03eeea5ca268210b3165fe1 (diff) | |
parent | 06b031de22d28ae76b2e5bfaf22c56a265a1e106 (diff) | |
download | op-kernel-dev-0cbeafb245ca568bc0765645aa64f0451b716657.zip op-kernel-dev-0cbeafb245ca568bc0765645aa64f0451b716657.tar.gz |
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
- more MM stuff:
- Kirill's page-flags rework
- Kirill's now-allegedly-fixed THP rework
- MADV_FREE implementation
- DAX feature work (msync/fsync). This isn't quite complete but DAX
is new and it's good enough and the guys have a handle on what
needs to be done - I expect this to be wrapped in the next week or
two.
- some vsprintf maintenance work
- various other misc bits
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (145 commits)
printk: change recursion_bug type to bool
lib/vsprintf: factor out %pN[F] handler as netdev_bits()
lib/vsprintf: refactor duplicate code to special_hex_number()
printk-formats.txt: remove unimplemented %pT
printk: help pr_debug and pr_devel to optimize out arguments
lib/test_printf.c: test dentry printing
lib/test_printf.c: add test for large bitmaps
lib/test_printf.c: account for kvasprintf tests
lib/test_printf.c: add a few number() tests
lib/test_printf.c: test precision quirks
lib/test_printf.c: check for out-of-bound writes
lib/test_printf.c: don't BUG
lib/kasprintf.c: add sanity check to kvasprintf
lib/vsprintf.c: warn about too large precisions and field widths
lib/vsprintf.c: help gcc make number() smaller
lib/vsprintf.c: expand field_width to 24 bits
lib/vsprintf.c: eliminate potential race in string()
lib/vsprintf.c: move string() below widen_string()
lib/vsprintf.c: pull out padding code from dentry_name()
printk: do cond_resched() between lines while outputting to consoles
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/debug.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 25 | ||||
-rw-r--r-- | mm/gup.c | 172 | ||||
-rw-r--r-- | mm/huge_memory.c | 1506 | ||||
-rw-r--r-- | mm/hugetlb.c | 12 | ||||
-rw-r--r-- | mm/internal.h | 70 | ||||
-rw-r--r-- | mm/ksm.c | 69 | ||||
-rw-r--r-- | mm/madvise.c | 201 | ||||
-rw-r--r-- | mm/memcontrol.c | 106 | ||||
-rw-r--r-- | mm/memory-failure.c | 125 | ||||
-rw-r--r-- | mm/memory.c | 101 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 67 | ||||
-rw-r--r-- | mm/mempolicy.c | 45 | ||||
-rw-r--r-- | mm/migrate.c | 21 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mlock.c | 27 | ||||
-rw-r--r-- | mm/mmap.c | 25 | ||||
-rw-r--r-- | mm/mprotect.c | 7 | ||||
-rw-r--r-- | mm/mremap.c | 15 | ||||
-rw-r--r-- | mm/page_alloc.c | 47 | ||||
-rw-r--r-- | mm/page_idle.c | 27 | ||||
-rw-r--r-- | mm/page_isolation.c | 6 | ||||
-rw-r--r-- | mm/pagewalk.c | 2 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 14 | ||||
-rw-r--r-- | mm/rmap.c | 369 | ||||
-rw-r--r-- | mm/shmem.c | 25 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 76 | ||||
-rw-r--r-- | mm/sparse.c | 8 | ||||
-rw-r--r-- | mm/swap.c | 319 | ||||
-rw-r--r-- | mm/swap_state.c | 9 | ||||
-rw-r--r-- | mm/swapfile.c | 34 | ||||
-rw-r--r-- | mm/userfaultfd.c | 8 | ||||
-rw-r--r-- | mm/util.c | 24 | ||||
-rw-r--r-- | mm/vmalloc.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 16 | ||||
-rw-r--r-- | mm/vmstat.c | 5 |
37 files changed, 2155 insertions, 1444 deletions
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_MEMORY_FAILURE {1UL << PG_hwpoison, "hwpoison" }, #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) {1UL << PG_young, "young" }, {1UL << PG_idle, "idle" }, @@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags, void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags) { - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); + if (PageCompound(page)) + pr_cont(" compound_mapcount: %d", compound_mapcount(page)); + pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); if (reason) diff --git a/mm/filemap.c b/mm/filemap.c index ff42d31..847ee43c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); - BUG_ON(page_mapped(page)); + VM_BUG_ON_PAGE(page_mapped(page), page); /* * At this point page must be either written or cleaned by truncate. @@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page, if (!huge) { error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg); + gfp_mask, &memcg, false); if (error) return error; } @@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page, error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); return error; } @@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page, __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: @@ -653,7 +653,7 @@ err_insert: /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return error; } @@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret; - __set_page_locked(page); + __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow); if (unlikely(ret)) - __clear_page_locked(page); + __ClearPageLocked(page); else { /* * The page might have been evicted from cache only @@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { + page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_atomic(); @@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio); */ void __lock_page(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, + __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); int __lock_page_killable(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - return __wait_on_bit_lock(page_waitqueue(page), &wait, + return __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -4,6 +4,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -116,8 +128,28 @@ retry: } } - if (flags & FOLL_GET) - get_page_foll(page); + if (flags & FOLL_SPLIT && PageTransCompound(page)) { + int ret; + get_page(page); + pte_unmap_unlock(ptep, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return ERR_PTR(ret); + goto retry; + } + + if (flags & FOLL_GET) { + get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -130,6 +162,10 @@ retry: mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* Do not mlock pte-mapped THP */ + if (PageTransCompound(page)) + goto out; + /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - return follow_page_pte(vma, address, pmd, flags); - } + if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - return page; - } - } else + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (likely(!pmd_trans_huge(*pmd))) + return follow_page_pte(vma, address, pmd, flags); + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + return follow_page_pte(vma, address, pmd, flags); + } + if (flags & FOLL_SPLIT) { + int ret; + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + ret = 0; + split_huge_pmd(vma, pmd, address); + } else { + get_page(page); spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + } + + return ret ? ERR_PTR(ret) : + follow_page_pte(vma, address, pmd, flags); } - return follow_page_pte(vma, address, pmd, flags); + + page = follow_trans_huge_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages); * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() + * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller + * does not allow retry * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages); * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. + * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). + * This function will not return with an unlocked mmap_sem. So it has not the + * same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) + unsigned long address, unsigned int fault_flags, + bool *unlocked) { struct vm_area_struct *vma; vm_flags_t vm_flags; - int ret; + int ret, major = 0; + if (unlocked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + +retry: vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; @@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); + major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; BUG(); } + + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + if (!(fault_flags & FAULT_FLAG_TRIED)) { + *unlocked = true; + fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + if (tsk) { - if (ret & VM_FAULT_MAJOR) + if (major) tsk->maj_flt++; else tsk->min_flt++; @@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma, gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) gup_flags &= ~FOLL_POPULATE; - /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr) * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free * pages containing page tables. * - * *) THP splits will broadcast an IPI, this can be achieved by overriding - * pmdp_splitting_flush. - * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. @@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * for an example see gup_get_pte in arch/x86/mm/gup.c */ pte_t pte = READ_ONCE(*ptep); - struct page *page; + struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow @@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); + head = compound_head(page); - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); + put_page(head); goto pte_unmap; } + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; @@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pmd_write(orig)) @@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - /* - * Any tail pages need their mapcount reference taken before we - * return. (This allows the THP code to bump their ref count when - * they are split into base pages). - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pud_write(orig)) @@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page, *tail; + struct page *head, *page; if (write && !pgd_write(orig)) return 0; @@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f952f05..b2db981 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,12 +16,16 @@ #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> +#include <linux/swapops.h> #include <linux/dax.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> +#include <linux/pfn_t.h> #include <linux/mman.h> +#include <linux/memremap.h> #include <linux/pagemap.h> +#include <linux/debugfs.h> #include <linux/migrate.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> @@ -45,6 +49,7 @@ enum scan_result { SCAN_PAGE_LRU, SCAN_PAGE_LOCK, SCAN_PAGE_ANON, + SCAN_PAGE_COMPOUND, SCAN_ANY_PROCESS, SCAN_VMA_NULL, SCAN_VMA_CHECK, @@ -133,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +static DEFINE_SPINLOCK(split_queue_lock); +static LIST_HEAD(split_queue); +static unsigned long split_queue_len; +static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) { @@ -665,6 +674,9 @@ static int __init hugepage_init(void) err = register_shrinker(&huge_zero_page_shrinker); if (err) goto err_hzp_shrinker; + err = register_shrinker(&deferred_split_shrinker); + if (err) + goto err_split_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -682,6 +694,8 @@ static int __init hugepage_init(void) return 0; err_khugepaged: + unregister_shrinker(&deferred_split_shrinker); +err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: khugepaged_slab_exit(); @@ -738,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) return entry; } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * ->lru in the tail pages is occupied by compound_head. + * Let's use ->mapping + ->index in the second tail page as list_head. + */ + return (struct list_head *)&page[2].mapping; +} + +void prep_transhuge_page(struct page *page) +{ + /* + * we use page->mapping and page->indexlru in second tail page + * as list_head: assuming THP order >= 2 + */ + BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + + INIT_LIST_HEAD(page_deferred_list(page)); + set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -751,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -759,7 +794,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return VM_FAULT_OOM; } @@ -775,7 +810,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); } else { @@ -786,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, int ret; spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); ret = handle_userfault(vma, address, flags, @@ -797,8 +832,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, haddr, true); + mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -892,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } + prep_transhuge_page(page); return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, flags); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) { - entry = pmd_mkhuge(pfn_pmd(pfn, prot)); - if (write) { - entry = pmd_mkyoung(pmd_mkdirty(entry)); - entry = maybe_pmd_mkwrite(entry, vma); - } - set_pmd_at(mm, addr, pmd, entry); - update_mmu_cache_pmd(vma, addr, pmd); - } + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pmd_mkdevmap(entry); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); spin_unlock(ptl); } int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, bool write) + pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; /* @@ -929,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; @@ -939,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + pmd_t _pmd; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pmd is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pmd to + * set the young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -960,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(!pmd_trans_huge(pmd))) { + if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } @@ -983,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (unlikely(pmd_trans_splitting(pmd))) { - /* split huge page running from under us */ - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - pte_free(dst_mm, pgtable); - - wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ - goto out; + if (pmd_trans_huge(pmd)) { + /* thp accounting separate from pmd_devmap accounting */ + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&dst_mm->nr_ptes); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); } - src_page = pmd_page(pmd); - VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - get_page(src_page); - page_dup_rmap(src_page); - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: @@ -1035,37 +1122,6 @@ unlock: spin_unlock(ptl); } -/* - * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages - * during copy_user_huge_page()'s copy_page_rep(): in the case when - * the source page gets split and a tail freed before copy completes. - * Called under pmd_lock of checked pmd, so safe from splitting itself. - */ -static void get_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - atomic_add(HPAGE_PMD_NR, &page->_count); - while (++page < endpage) - get_huge_page_tail(page); - } else { - get_page(page); - } -} - -static void put_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - while (page < endpage) - put_page(page++); - } else { - put_page(page); - } -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1095,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg))) { + &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, + false); put_page(pages[i]); } kfree(pages); @@ -1139,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr); - mem_cgroup_commit_charge(pages[i], memcg, false); + page_add_new_anon_rmap(pages[i], vma, haddr, false); + mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); @@ -1151,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1168,7 +1225,7 @@ out_free_pages: for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, false); put_page(pages[i]); } kfree(pages); @@ -1198,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - if (page_mapcount(page) == 1) { + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. We can do it by checking page_mapcount() on each sub-page, but + * it's expensive. + * The cheaper way is to check page_count() to be equal 1: every + * mapcount takes page reference reference, so this way we can + * guarantee, that the PMD is the only mapping. + * This can give false negative if somebody pinned the page, but that's + * fine. + */ + if (page_mapcount(page) == 1 && page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1207,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_user_huge_page(page); + get_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1217,30 +1284,33 @@ alloc: } else new_page = NULL; - if (unlikely(!new_page)) { + if (likely(new_page)) { + prep_transhuge_page(new_page); + } else { if (!page) { - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); if (ret & VM_FAULT_OOM) { - split_huge_page(page); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } - put_user_huge_page(page); + put_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, + true))) { put_page(new_page); if (page) { - split_huge_page(page); - put_user_huge_page(page); + split_huge_pmd(vma, pmd, address); + put_page(page); } else - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1260,10 +1330,10 @@ alloc: spin_lock(ptl); if (page) - put_user_huge_page(page); + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; } else { @@ -1271,8 +1341,8 @@ alloc: entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_huge_clear_flush_notify(vma, haddr, pmd); - page_add_new_anon_rmap(new_page, vma, haddr); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, haddr, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); @@ -1281,7 +1351,7 @@ alloc: put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page); + page_remove_rmap(page, true); put_page(page); } ret |= VM_FAULT_WRITE; @@ -1319,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - if (flags & FOLL_TOUCH) { - pmd_t _pmd; + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* - * We should set the dirty bit only for FOLL_WRITE but - * for now the dirty bit in the pmd is meaningless. - * And if the dirty bit will become meaningful and - * we'll only set it with FOLL_WRITE, an atomic - * set_bit will be required on the pmd to set the - * young bit, instead of the current set_pmd_at. + * We don't mlock() pte-mapped THPs. This way we can avoid + * leaking mlocked pages into non-VM_LOCKED VMAs. + * + * In most cases the pmd is the only mapping of the page as we + * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for + * writable private mappings in populate_vma_page_range(). + * + * The only scenario when we have the page shared here is if we + * mlocking read-only mapping shared over fork(). We skip + * mlocking such pages. */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) - update_mmu_cache_pmd(vma, addr, pmd); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - if (page->mapping && trylock_page(page)) { + if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && + page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) mlock_vma_page(page); @@ -1345,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); out: return page; @@ -1480,13 +1550,84 @@ out: return 0; } +int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next) + +{ + spinlock_t *ptl; + pmd_t orig_pmd; + struct page *page; + struct mm_struct *mm = tlb->mm; + int ret = 0; + + if (!pmd_trans_huge_lock(pmd, vma, &ptl)) + goto out; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) { + ret = 1; + goto out; + } + + page = pmd_page(orig_pmd); + /* + * If other processes are mapping this page, we couldn't discard + * the page unless they all do MADV_FREE so let's skip the page. + */ + if (page_mapcount(page) != 1) + goto out; + + if (!trylock_page(page)) + goto out; + + /* + * If user want to discard part-pages of THP, split it so MADV_FREE + * will deactivate only them. + */ + if (next - addr != HPAGE_PMD_SIZE) { + get_page(page); + spin_unlock(ptl); + if (split_huge_page(page)) { + put_page(page); + unlock_page(page); + goto out_unlocked; + } + put_page(page); + unlock_page(page); + ret = 1; + goto out_unlocked; + } + + if (PageDirty(page)) + ClearPageDirty(page); + unlock_page(page); + + if (PageActive(page)) + deactivate_page(page); + + if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + ret = 1; +out: + spin_unlock(ptl); +out_unlocked: + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) + if (!__pmd_trans_huge_lock(pmd, vma, &ptl)) return 0; /* * For architectures like ppc64 we look at deposited pgtable @@ -1508,7 +1649,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, put_huge_zero_page(); } else { struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page); + page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -1520,13 +1661,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, +bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; - int ret = 0; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; @@ -1535,7 +1675,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, (new_addr & ~HPAGE_PMD_MASK) || old_end - old_addr < HPAGE_PMD_SIZE || (new_vma->vm_flags & VM_NOHUGEPAGE)) - goto out; + return false; /* * The destination pmd shouldn't be established, free_pgtables() @@ -1543,15 +1683,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, */ if (WARN_ON(!pmd_none(*new_pmd))) { VM_BUG_ON(pmd_trans_huge(*new_pmd)); - goto out; + return false; } /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_sem prevents deadlock. */ - ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); - if (ret == 1) { + if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -1567,9 +1706,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); + return true; } -out: - return ret; + return false; } /* @@ -1585,7 +1724,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl)) { pmd_t entry; bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; @@ -1616,405 +1755,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } /* - * Returns 1 if a given pmd maps a stable (not under splitting) thp. - * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. + * Returns true if a given pmd maps a thp, false otherwise. * - * Note that if it returns 1, this routine returns without unlocking page - * table locks. So callers must unlock them. + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { *ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(*ptl); - wait_split_huge_page(vma->anon_vma, pmd); - return -1; - } else { - /* Thp mapped by 'pmd' is stable, so we can - * handle it as it is. */ - return 1; - } - } - spin_unlock(*ptl); - return 0; -} - -/* - * This function returns whether a given @page is mapped onto the @address - * in the virtual space of @mm. - * - * When it's true, this function returns *pmd with holding the page table lock - * and passing it back to the caller via @ptl. - * If it's false, returns NULL without holding the page table lock. - */ -pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - enum page_check_address_pmd_flag flag, - spinlock_t **ptl) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (address & ~HPAGE_PMD_MASK) - return NULL; - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - - *ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock; - if (pmd_page(*pmd) != page) - goto unlock; - /* - * split_vma() may create temporary aliased mappings. There is - * no risk as long as all huge pmd are found and have their - * splitting bit set before __split_huge_page_refcount - * runs. Finding the same huge pmd more than once during the - * same rmap walk is not a problem. - */ - if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)) - goto unlock; - if (pmd_trans_huge(*pmd)) { - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && - !pmd_trans_splitting(*pmd)); - return pmd; - } -unlock: + if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + return true; spin_unlock(*ptl); - return NULL; -} - -static int __split_huge_page_splitting(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pmd_t *pmd; - int ret = 0; - /* For mmu_notifiers */ - const unsigned long mmun_start = address; - const unsigned long mmun_end = address + HPAGE_PMD_SIZE; - - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); - if (pmd) { - /* - * We can't temporarily set the pmd to null in order - * to split it, the pmd must remain marked huge at all - * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->rwsem to - * serialize against split_huge_page*. - */ - pmdp_splitting_flush(vma, address, pmd); - - ret = 1; - spin_unlock(ptl); - } - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - return ret; -} - -static void __split_huge_page_refcount(struct page *page, - struct list_head *list) -{ - int i; - struct zone *zone = page_zone(page); - struct lruvec *lruvec; - int tail_count = 0; - - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); - - compound_lock(page); - /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(page); - - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { - struct page *page_tail = page + i; - - /* tail_page->_mapcount cannot change */ - BUG_ON(page_mapcount(page_tail) < 0); - tail_count += page_mapcount(page_tail); - /* check for overflow */ - BUG_ON(tail_count < 0); - BUG_ON(atomic_read(&page_tail->_count) != 0); - /* - * tail_page->_count is zero and not changing from - * under us. But get_page_unless_zero() may be running - * from under us on the tail_page. If we used - * atomic_set() below instead of atomic_add(), we - * would then run atomic_set() concurrently with - * get_page_unless_zero(), and atomic_set() is - * implemented in C not using locked ops. spin_unlock - * on x86 sometime uses locked ops because of PPro - * errata 66, 92, so unless somebody can guarantee - * atomic_set() here would be safe on all archs (and - * not only on x86), it's safer to use atomic_add(). - */ - atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, - &page_tail->_count); - - /* after clearing PageTail the gup refcount can be released */ - smp_mb__after_atomic(); - - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - page_tail->flags |= (page->flags & - ((1L << PG_referenced) | - (1L << PG_swapbacked) | - (1L << PG_mlocked) | - (1L << PG_uptodate) | - (1L << PG_active) | - (1L << PG_unevictable))); - page_tail->flags |= (1L << PG_dirty); - - clear_compound_head(page_tail); - - if (page_is_young(page)) - set_page_young(page_tail); - if (page_is_idle(page)) - set_page_idle(page_tail); - - /* - * __split_huge_page_splitting() already set the - * splitting bit in all pmd that could map this - * hugepage, that will ensure no CPU can alter the - * mapcount on the head page. The mapcount is only - * accounted in the head page and it has to be - * transferred to all tail pages in the below code. So - * for this code to be safe, the split the mapcount - * can't change. But that doesn't mean userland can't - * keep changing and reading the page contents while - * we transfer the mapcount, so the pmd splitting - * status is achieved setting a reserved bit in the - * pmd, not by clearing the present bit. - */ - page_tail->_mapcount = page->_mapcount; - - BUG_ON(page_tail->mapping); - page_tail->mapping = page->mapping; - - page_tail->index = page->index + i; - page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); - - BUG_ON(!PageAnon(page_tail)); - BUG_ON(!PageUptodate(page_tail)); - BUG_ON(!PageDirty(page_tail)); - BUG_ON(!PageSwapBacked(page_tail)); - - lru_add_page_tail(page, page_tail, lruvec, list); - } - atomic_sub(tail_count, &page->_count); - BUG_ON(atomic_read(&page->_count) <= 0); - - __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); - - ClearPageCompound(page); - compound_unlock(page); - spin_unlock_irq(&zone->lru_lock); - - for (i = 1; i < HPAGE_PMD_NR; i++) { - struct page *page_tail = page + i; - BUG_ON(page_count(page_tail) <= 0); - /* - * Tail pages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. - */ - put_page(page_tail); - } - - /* - * Only the head page (now become a regular page) is required - * to be pinned by the caller. - */ - BUG_ON(page_count(page) <= 0); -} - -static int __split_huge_page_map(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pmd_t *pmd, _pmd; - int ret = 0, i; - pgtable_t pgtable; - unsigned long haddr; - - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); - if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); - if (pmd_write(*pmd)) - BUG_ON(page_mapcount(page) != 1); - - haddr = address; - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - BUG_ON(PageCompound(page+i)); - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - entry = mk_pte(page + i, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (!pmd_write(*pmd)) - entry = pte_wrprotect(entry); - if (!pmd_young(*pmd)) - entry = pte_mkold(entry); - pte = pte_offset_map(&_pmd, haddr); - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); - } - - smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and - * userland has the whole access to the hugepage - * during the split (which happens in place). If we - * overwrite the pmd with the not-huge version - * pointing to the pte here (which of course we could - * if all CPUs were bug free), userland could trigger - * a small page size TLB miss on the small sized TLB - * while the hugepage TLB entry is still established - * in the huge TLB. Some CPU doesn't like that. See - * http://support.amd.com/us/Processor_TechDocs/41322.pdf, - * Erratum 383 on page 93. Intel should be safe but is - * also warns that it's only safe if the permission - * and cache attributes of the two entries loaded in - * the two TLB is identical (which should be the case - * here). But it is generally safer to never allow - * small and huge TLB entries for the same virtual - * address to be loaded simultaneously. So instead of - * doing "pmd_populate(); flush_pmd_tlb_range();" we first - * mark the current pmd notpresent (atomically because - * here the pmd_trans_huge and pmd_trans_splitting - * must remain set at all times on the pmd until the - * split is complete for this pmd), then we flush the - * SMP TLB and finally we write the non-huge version - * of the pmd entry with pmd_populate. - */ - pmdp_invalidate(vma, address, pmd); - pmd_populate(mm, pmd, pgtable); - ret = 1; - spin_unlock(ptl); - } - - return ret; -} - -/* must be called with anon_vma->root->rwsem held */ -static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma, - struct list_head *list) -{ - int mapcount, mapcount2; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct anon_vma_chain *avc; - - BUG_ON(!PageHead(page)); - BUG_ON(PageTail(page)); - - mapcount = 0; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long addr = vma_address(page, vma); - BUG_ON(is_vma_temporary_stack(vma)); - mapcount += __split_huge_page_splitting(page, vma, addr); - } - /* - * It is critical that new vmas are added to the tail of the - * anon_vma list. This guarantes that if copy_huge_pmd() runs - * and establishes a child pmd before - * __split_huge_page_splitting() freezes the parent pmd (so if - * we fail to prevent copy_huge_pmd() from running until the - * whole __split_huge_page() is complete), we will still see - * the newly established pmd of the child later during the - * walk, to be able to set it as pmd_trans_splitting too. - */ - if (mapcount != page_mapcount(page)) { - pr_err("mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); - BUG(); - } - - __split_huge_page_refcount(page, list); - - mapcount2 = 0; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long addr = vma_address(page, vma); - BUG_ON(is_vma_temporary_stack(vma)); - mapcount2 += __split_huge_page_map(page, vma, addr); - } - if (mapcount != mapcount2) { - pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); - BUG(); - } -} - -/* - * Split a hugepage into normal pages. This doesn't change the position of head - * page. If @list is null, tail pages will be added to LRU list, otherwise, to - * @list. Both head page and tail pages will inherit mapping, flags, and so on - * from the hugepage. - * Return 0 if the hugepage is split successfully otherwise return 1. - */ -int split_huge_page_to_list(struct page *page, struct list_head *list) -{ - struct anon_vma *anon_vma; - int ret = 1; - - BUG_ON(is_huge_zero_page(page)); - BUG_ON(!PageAnon(page)); - - /* - * The caller does not necessarily hold an mmap_sem that would prevent - * the anon_vma disappearing so we first we take a reference to it - * and then lock the anon_vma for write. This is similar to - * page_lock_anon_vma_read except the write lock is taken to serialise - * against parallel split or collapse operations. - */ - anon_vma = page_get_anon_vma(page); - if (!anon_vma) - goto out; - anon_vma_lock_write(anon_vma); - - ret = 0; - if (!PageCompound(page)) - goto out_unlock; - - BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma, list); - count_vm_event(THP_SPLIT); - - BUG_ON(PageCompound(page)); -out_unlock: - anon_vma_unlock_write(anon_vma); - put_anon_vma(anon_vma); -out: - return ret; + return false; } #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) @@ -2371,7 +2124,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page); + page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -2481,6 +2234,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, return NULL; } + prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return *hpage; } @@ -2492,8 +2246,12 @@ static int khugepaged_find_target_node(void) static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); + struct page *page; + + page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + if (page) + prep_transhuge_page(page); + return page; } static struct page *khugepaged_alloc_hugepage(bool *wait) @@ -2543,7 +2301,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) return false; - if (!vma->anon_vma || vma->vm_ops) return false; if (is_vma_temporary_stack(vma)) @@ -2583,7 +2340,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -2682,8 +2439,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -2703,7 +2460,7 @@ out_nolock: trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -2755,6 +2512,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_NULL; goto out_unmap; } + + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out_unmap; + } + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. @@ -2767,7 +2531,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } khugepaged_node_load[node]++; - VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page)) { result = SCAN_SCAN_ABORT; goto out_unmap; @@ -3040,8 +2803,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ + pmdp_huge_clear_flush_notify(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -3060,66 +2823,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, put_huge_zero_page(); } -void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd) +static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long haddr, bool freeze) { - spinlock_t *ptl; - struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct page *page; + pgtable_t pgtable; + pmd_t _pmd; + bool young, write, dirty; + int i; - BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); + VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + + count_vm_event(THP_SPLIT_PMD); - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; -again: - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_trans_huge(*pmd))) - goto unlock; if (vma_is_dax(vma)) { pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); if (is_huge_zero_pmd(_pmd)) put_huge_zero_page(); + return; } else if (is_huge_zero_pmd(*pmd)) { - __split_huge_zero_page_pmd(vma, haddr, pmd); - } else { - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!page_count(page), page); - get_page(page); + return __split_huge_zero_page_pmd(vma, haddr, pmd); } - unlock: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (!page) - return; + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + atomic_add(HPAGE_PMD_NR - 1, &page->_count); + write = pmd_write(*pmd); + young = pmd_young(*pmd); + dirty = pmd_dirty(*pmd); - split_huge_page(page); - put_page(page); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t entry, *pte; + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. + */ + if (freeze) { + swp_entry_t swp_entry; + swp_entry = make_migration_entry(page + i, write); + entry = swp_entry_to_pte(swp_entry); + } else { + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(entry, vma); + if (!write) + entry = pte_wrprotect(entry); + if (!young) + entry = pte_mkold(entry); + } + if (dirty) + SetPageDirty(page + i); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + atomic_inc(&page[i]._mapcount); + pte_unmap(pte); + } + + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_inc(&page[i]._mapcount); + } + + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } + } + + smp_wmb(); /* make pte visible before pmd */ /* - * We don't always have down_write of mmap_sem here: a racing - * do_huge_pmd_wp_page() might have copied-on-write to another - * huge page before our split_huge_page() got the anon_vma lock. + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * and pmd_trans_splitting must remain set at all times on the pmd + * until the split is complete for this pmd), then we flush the SMP TLB + * and finally we write the non-huge version of the pmd entry with + * pmd_populate. */ - if (unlikely(pmd_trans_huge(*pmd))) - goto again; + pmdp_invalidate(vma, haddr, pmd); + pmd_populate(mm, pmd, pgtable); + + if (freeze) { + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + page_remove_rmap(page + i, false); + put_page(page + i); + } + } } -void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd) +void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address) { - struct vm_area_struct *vma; + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + struct page *page = NULL; + unsigned long haddr = address & HPAGE_PMD_MASK; - vma = find_vma(mm, address); - BUG_ON(vma == NULL); - split_huge_page_pmd(vma, address, pmd); + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); + ptl = pmd_lock(mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (PageMlocked(page)) + get_page(page); + else + page = NULL; + } else if (!pmd_devmap(*pmd)) + goto out; + __split_huge_pmd_locked(vma, pmd, haddr, false); +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + if (page) { + lock_page(page); + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } } -static void split_huge_page_address(struct mm_struct *mm, +static void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; @@ -3128,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm, VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); + pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; @@ -3137,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm, return; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd_mm(mm, address, pmd); + split_huge_pmd(vma, pmd, address); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3159,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, start); + split_huge_pmd_address(vma, start); /* * If the new end address isn't hpage aligned and it could @@ -3169,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, end); + split_huge_pmd_address(vma, end); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3183,6 +3033,540 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_page_address(next->vm_mm, nstart); + split_huge_pmd_address(next, nstart); + } +} + +static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, nr = HPAGE_PMD_NR; + + /* Skip pages which doesn't belong to the VMA */ + if (address < vma->vm_start) { + int off = (vma->vm_start - address) >> PAGE_SHIFT; + page += off; + nr -= off; + address = vma->vm_start; + } + + pgd = pgd_offset(vma->vm_mm, address); + if (!pgd_present(*pgd)) + return; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + pmd = pmd_offset(pud, address); + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return; + } + if (pmd_trans_huge(*pmd)) { + if (page == pmd_page(*pmd)) + __split_huge_pmd_locked(vma, pmd, haddr, true); + spin_unlock(ptl); + return; + } + spin_unlock(ptl); + + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); + for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { + pte_t entry, swp_pte; + swp_entry_t swp_entry; + + /* + * We've just crossed page table boundary: need to map next one. + * It can happen if THP was mremaped to non PMD-aligned address. + */ + if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { + pte_unmap_unlock(pte - 1, ptl); + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + pte = pte_offset_map_lock(vma->vm_mm, pmd, + address, &ptl); + } + + if (!pte_present(*pte)) + continue; + if (page_to_pfn(page) != pte_pfn(*pte)) + continue; + flush_cache_page(vma, address, page_to_pfn(page)); + entry = ptep_clear_flush(vma, address, pte); + if (pte_dirty(entry)) + SetPageDirty(page); + swp_entry = make_migration_entry(page, pte_write(entry)); + swp_pte = swp_entry_to_pte(swp_entry); + if (pte_soft_dirty(entry)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(vma->vm_mm, address, pte, swp_pte); + page_remove_rmap(page, false); + put_page(page); + } + pte_unmap_unlock(pte - 1, ptl); +} + +static void freeze_page(struct anon_vma *anon_vma, struct page *page) +{ + struct anon_vma_chain *avc; + pgoff_t pgoff = page_to_pgoff(page); + + VM_BUG_ON_PAGE(!PageHead(page), page); + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, + pgoff + HPAGE_PMD_NR - 1) { + unsigned long address = __vma_address(page, avc->vma); + + mmu_notifier_invalidate_range_start(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + freeze_page_vma(avc->vma, page, address); + mmu_notifier_invalidate_range_end(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + } +} + +static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + spinlock_t *ptl; + pmd_t *pmd; + pte_t *pte, entry; + swp_entry_t swp_entry; + unsigned long haddr = address & HPAGE_PMD_MASK; + int i, nr = HPAGE_PMD_NR; + + /* Skip pages which doesn't belong to the VMA */ + if (address < vma->vm_start) { + int off = (vma->vm_start - address) >> PAGE_SHIFT; + page += off; + nr -= off; + address = vma->vm_start; + } + + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); + for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { + /* + * We've just crossed page table boundary: need to map next one. + * It can happen if THP was mremaped to non-PMD aligned address. + */ + if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { + pte_unmap_unlock(pte - 1, ptl); + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + pte = pte_offset_map_lock(vma->vm_mm, pmd, + address, &ptl); + } + + if (!is_swap_pte(*pte)) + continue; + + swp_entry = pte_to_swp_entry(*pte); + if (!is_migration_entry(swp_entry)) + continue; + if (migration_entry_to_page(swp_entry) != page) + continue; + + get_page(page); + page_add_anon_rmap(page, vma, address, false); + + entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); + if (PageDirty(page)) + entry = pte_mkdirty(entry); + if (is_write_migration_entry(swp_entry)) + entry = maybe_mkwrite(entry, vma); + + flush_dcache_page(page); + set_pte_at(vma->vm_mm, address, pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + } + pte_unmap_unlock(pte - 1, ptl); +} + +static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +{ + struct anon_vma_chain *avc; + pgoff_t pgoff = page_to_pgoff(page); + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff, pgoff + HPAGE_PMD_NR - 1) { + unsigned long address = __vma_address(page, avc->vma); + + mmu_notifier_invalidate_range_start(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + unfreeze_page_vma(avc->vma, page, address); + mmu_notifier_invalidate_range_end(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + } +} + +static int __split_huge_page_tail(struct page *head, int tail, + struct lruvec *lruvec, struct list_head *list) +{ + int mapcount; + struct page *page_tail = head + tail; + + mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + + /* + * tail_page->_count is zero and not changing from under us. But + * get_page_unless_zero() may be running from under us on the + * tail_page. If we used atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is implemented in C not + * using locked ops. spin_unlock on x86 sometime uses locked ops + * because of PPro errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and not only on x86), + * it's safer to use atomic_add(). + */ + atomic_add(mapcount + 1, &page_tail->_count); + + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_locked) | + (1L << PG_unevictable) | + (1L << PG_dirty))); + + /* + * After clearing PageTail the gup refcount can be released. + * Page flags also must be visible before we make the page non-compound. + */ + smp_wmb(); + + clear_compound_head(page_tail); + + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) + set_page_idle(page_tail); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + + page_tail->index = head->index + tail; + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + lru_add_page_tail(head, page_tail, lruvec, list); + + return mapcount; +} + +static void __split_huge_page(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct zone *zone = page_zone(head); + struct lruvec *lruvec; + int i, tail_mapcount; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(head, zone); + + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(head); + + tail_mapcount = 0; + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) + tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); + atomic_sub(tail_mapcount, &head->_count); + + ClearPageCompound(head); + spin_unlock_irq(&zone->lru_lock); + + unfreeze_page(page_anon_vma(head), head); + + for (i = 0; i < HPAGE_PMD_NR; i++) { + struct page *subpage = head + i; + if (subpage == page) + continue; + unlock_page(subpage); + + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(subpage); } } + +int total_mapcount(struct page *page) +{ + int i, ret; + + VM_BUG_ON_PAGE(PageTail(page), page); + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + + ret = compound_mapcount(page); + if (PageHuge(page)) + return ret; + for (i = 0; i < HPAGE_PMD_NR; i++) + ret += atomic_read(&page[i]._mapcount) + 1; + if (PageDoubleMap(page)) + ret -= HPAGE_PMD_NR; + return ret; +} + +/* + * This function splits huge page into normal pages. @page can point to any + * subpage of huge page to split. Split doesn't change the position of @page. + * + * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. + * The huge page must be locked. + * + * If @list is null, tail pages will be added to LRU list, otherwise, to @list. + * + * Both head page and tail pages will inherit mapping, flags, and so on from + * the hugepage. + * + * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if + * they are not mapped. + * + * Returns 0 if the hugepage is split successfully. + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under + * us. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct anon_vma *anon_vma; + int count, mapcount, ret; + bool mlocked; + + VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_PAGE(!PageCompound(page), page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. + */ + anon_vma = page_get_anon_vma(head); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + anon_vma_lock_write(anon_vma); + + /* + * Racy check if we can split the page, before freeze_page() will + * split PMDs + */ + if (total_mapcount(head) != page_count(head) - 1) { + ret = -EBUSY; + goto out_unlock; + } + + mlocked = PageMlocked(page); + freeze_page(anon_vma, head); + VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ + if (mlocked) + lru_add_drain(); + + /* Prevent deferred_split_scan() touching ->_count */ + spin_lock(&split_queue_lock); + count = page_count(head); + mapcount = total_mapcount(head); + if (!mapcount && count == 1) { + if (!list_empty(page_deferred_list(head))) { + split_queue_len--; + list_del(page_deferred_list(head)); + } + spin_unlock(&split_queue_lock); + __split_huge_page(page, list); + ret = 0; + } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { + spin_unlock(&split_queue_lock); + pr_alert("total_mapcount: %u, page_count(): %u\n", + mapcount, count); + if (PageTail(page)) + dump_page(head, NULL); + dump_page(page, "total_mapcount(head) > 0"); + BUG(); + } else { + spin_unlock(&split_queue_lock); + unfreeze_page(anon_vma, head); + ret = -EBUSY; + } + +out_unlock: + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); +out: + count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + return ret; +} + +void free_transhuge_page(struct page *page) +{ + unsigned long flags; + + spin_lock_irqsave(&split_queue_lock, flags); + if (!list_empty(page_deferred_list(page))) { + split_queue_len--; + list_del(page_deferred_list(page)); + } + spin_unlock_irqrestore(&split_queue_lock, flags); + free_compound_page(page); +} + +void deferred_split_huge_page(struct page *page) +{ + unsigned long flags; + + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + + spin_lock_irqsave(&split_queue_lock, flags); + if (list_empty(page_deferred_list(page))) { + list_add_tail(page_deferred_list(page), &split_queue); + split_queue_len++; + } + spin_unlock_irqrestore(&split_queue_lock, flags); +} + +static unsigned long deferred_split_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + /* + * Split a page from split_queue will free up at least one page, + * at most HPAGE_PMD_NR - 1. We don't track exact number. + * Let's use HPAGE_PMD_NR / 2 as ballpark. + */ + return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; +} + +static unsigned long deferred_split_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long flags; + LIST_HEAD(list), *pos, *next; + struct page *page; + int split = 0; + + spin_lock_irqsave(&split_queue_lock, flags); + list_splice_init(&split_queue, &list); + + /* Take pin on all head pages to avoid freeing them under us */ + list_for_each_safe(pos, next, &list) { + page = list_entry((void *)pos, struct page, mapping); + page = compound_head(page); + /* race with put_compound_page() */ + if (!get_page_unless_zero(page)) { + list_del_init(page_deferred_list(page)); + split_queue_len--; + } + } + spin_unlock_irqrestore(&split_queue_lock, flags); + + list_for_each_safe(pos, next, &list) { + page = list_entry((void *)pos, struct page, mapping); + lock_page(page); + /* split_huge_page() removes page from list on success */ + if (!split_huge_page(page)) + split++; + unlock_page(page); + put_page(page); + } + + spin_lock_irqsave(&split_queue_lock, flags); + list_splice_tail(&list, &split_queue); + spin_unlock_irqrestore(&split_queue_lock, flags); + + return split * HPAGE_PMD_NR / 2; +} + +static struct shrinker deferred_split_shrinker = { + .count_objects = deferred_split_count, + .scan_objects = deferred_split_scan, + .seeks = DEFAULT_SEEKS, +}; + +#ifdef CONFIG_DEBUG_FS +static int split_huge_pages_set(void *data, u64 val) +{ + struct zone *zone; + struct page *page; + unsigned long pfn, max_zone_pfn; + unsigned long total = 0, split = 0; + + if (val != 1) + return -EINVAL; + + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (!get_page_unless_zero(page)) + continue; + + if (zone != page_zone(page)) + goto next; + + if (!PageHead(page) || !PageAnon(page) || + PageHuge(page)) + goto next; + + total++; + lock_page(page); + if (!split_huge_page(page)) + split++; + unlock_page(page); +next: + put_page(page); + } + } + + pr_info("%lu of %lu THP split", split, total); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, + "%llu\n"); + +static int __init split_huge_pages_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, + &split_huge_pages_fops); + if (!ret) + pr_warn("Failed to create split_huge_pages in debugfs"); + return 0; +} +late_initcall(split_huge_pages_debugfs); +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be934df..12908dc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - __SetPageHead(page); __ClearPageReserved(page); + __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { /* * For gigantic hugepages allocated through bootmem at @@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); - page_dup_rmap(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(pages_per_huge_page(h), dst); } @@ -3186,7 +3186,7 @@ again: set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page); + page_remove_rmap(page, true); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { address += sz; @@ -3415,7 +3415,7 @@ retry_avoidcopy: mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page); + page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; @@ -3585,7 +3585,7 @@ retry: ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); } else - page_dup_rmap(page); + page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); @@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page_foll(pages[i]); + get_page(pages[i]); } if (vmas) diff --git a/mm/internal.h b/mm/internal.h index 38e24b8..ed8b5ff 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,7 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/pagemap.h> /* * The set of flags that only affect watermark checking and reclaim @@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __get_page_tail_foll(struct page *page, - bool get_page_head) -{ - /* - * If we're getting a tail page, the elevated page->_count is - * required only in the head page and we will elevate the head - * page->_count and tail page->_mapcount. - * - * We elevate page_tail->_mapcount for tail pages to force - * page_tail->_count to be zero at all times to avoid getting - * false positives from get_page_unless_zero() with - * speculative page access (like in - * page_cache_get_speculative()) on tail pages. - */ - VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); - if (get_page_head) - atomic_inc(&compound_head(page)->_count); - get_huge_page_tail(page); -} - -/* - * This is meant to be called as the FOLL_GET operation of - * follow_page() and it must be called while holding the proper PT - * lock while the pte (or pmd_trans_huge) is still mapping the page. - */ -static inline void get_page_foll(struct page *page) -{ - if (unlikely(PageTail(page))) - /* - * This is safe only because - * __split_huge_page_refcount() can't run under - * get_page_foll() because we hold the proper PT lock. - */ - __get_page_tail_foll(page, true); - else { - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); - } -} - extern unsigned long highest_memmap_pfn; /* @@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern unsigned long vma_address(struct page *page, - struct vm_area_struct *vma); -#endif +/* + * At what user virtual address is page expected in @vma? + */ +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page_to_pgoff(page); + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + + return address; +} + #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } @@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item) up_read(&mm->mmap_sem); } -static struct page *page_trans_compound_anon(struct page *page) -{ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - /* - * head may actually be splitted and freed from under - * us but it's ok here. - */ - if (PageAnon(head)) - return head; - } - return NULL; -} - static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page) || page_trans_compound_anon(page)) { + if (PageAnon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -956,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } get_page(kpage); - page_add_anon_rmap(kpage, vma, addr); + page_add_anon_rmap(kpage, vma, addr, false); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -975,33 +961,6 @@ out: return err; } -static int page_trans_compound_anon_split(struct page *page) -{ - int ret = 0; - struct page *transhuge_head = page_trans_compound_anon(page); - if (transhuge_head) { - /* Get the reference on the head to split it. */ - if (get_page_unless_zero(transhuge_head)) { - /* - * Recheck we got the reference while the head - * was still anonymous. - */ - if (PageAnon(transhuge_head)) - ret = split_huge_page(transhuge_head); - else - /* - * Retry later if split_huge_page run - * from under us. - */ - ret = 1; - put_page(transhuge_head); - } else - /* Retry later if split_huge_page run from under us. */ - ret = 1; - } - return ret; -} - /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -1020,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (PageTransCompound(page) && page_trans_compound_anon_split(page)) - goto out; - BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1035,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, */ if (!trylock_page(page)) goto out; + + if (PageTransCompound(page)) { + err = split_huge_page(page); + if (err) + goto out_unlock; + } + /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its @@ -1050,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, */ set_page_stable_node(page, NULL); mark_page_accessed(page); + /* + * Page reclaim just frees a clean page with no dirty + * ptes: make sure that the ksm page would be swapped. + */ + if (!PageDirty(page)) + SetPageDirty(page); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); @@ -1065,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, } } +out_unlock: unlock_page(page); out: return err; @@ -1635,8 +1605,7 @@ next_mm: cond_resched(); continue; } - if (PageAnon(*page) || - page_trans_compound_anon(*page)) { + if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1899,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); - __set_page_locked(new_page); + __SetPageLocked(new_page); } return new_page; diff --git a/mm/madvise.c b/mm/madvise.c index c889fcb..f56825b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -20,6 +20,9 @@ #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlb.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte, *pte, ptent; + struct page *page; + int nr_swap = 0; + unsigned long next; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) + if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) + goto next; + + if (pmd_trans_unstable(pmd)) + return 0; + + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + /* + * If the pte has swp_entry, just clear page table to + * prevent swap-in which is more expensive rather than + * (page allocation + zeroing). + */ + if (!pte_present(ptent)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry)) + continue; + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + continue; + } + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * If pmd isn't transhuge but the page is THP and + * is owned by only this process, split it and + * deactivate all pages. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + goto out; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + goto out; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + goto out; + } + put_page(page); + unlock_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (PageSwapCache(page) || PageDirty(page)) { + if (!trylock_page(page)) + continue; + /* + * If page is shared with others, we couldn't clear + * PG_dirty of the page. + */ + if (page_mapcount(page) != 1) { + unlock_page(page); + continue; + } + + if (PageSwapCache(page) && !try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + if (pte_young(ptent) || pte_dirty(ptent)) { + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. + */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + if (PageActive(page)) + deactivate_page(page); + tlb_remove_tlb_entry(tlb, pte, addr); + } + } +out: + if (nr_swap) { + if (current->mm == mm) + sync_mm_rss(mm); + + add_mm_counter(mm, MM_SWAPENTS, nr_swap); + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); +next: + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment */ + if (!vma_is_anonymous(vma)) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. + */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* passthrough */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 54eae4f..0eda673 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -382,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = page->mem_cgroup; if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; - rcu_read_unlock(); return &memcg->css; } @@ -647,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - int nr_pages) + bool compound, int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is @@ -660,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_pages); + } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) @@ -2431,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compound_lock. - * charge/uncharge will be never happen and move_account() is done under - * compound_lock(), so we don't have to take care of races. + * zone->lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -3494,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, swap_buffers: /* Swap primary and spare array */ thresholds->spare = thresholds->primary; - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -4505,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 + * The caller must make sure the page is not on LRU (isolate_page() is useful.) * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, + bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { unsigned long flags; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + VM_BUG_ON(compound && !PageTransHuge(page)); /* * Prevent mem_cgroup_replace_page() from looking at * page->mem_cgroup of its source page while we change it. */ + ret = -EBUSY; if (!trylock_page(page)) goto out; @@ -4591,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); + mem_cgroup_charge_statistics(to, page, compound, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); + mem_cgroup_charge_statistics(from, page, compound, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -4683,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4871,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, union mc_target target; struct page *page; - /* - * We don't take compound_lock() here but no race with splitting thp - * happens because: - * - if pmd_trans_huge_lock() returns 1, the relevant thp is not - * under splitting, which means there's no concurrent thp split, - * - if another thread runs into split_huge_page() just after we - * entered this if-block, the thread must wait for page table lock - * to be unlocked in __split_huge_page_splitting(), where the main - * part of thp split is not executed yet. - */ - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(ptl); return 0; @@ -4890,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; @@ -4917,9 +4897,18 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; + /* + * We can have a part of the split pmd here. Moving it + * can be done but it would be too convoluted so simply + * ignore such a partial THP and keep it in original + * memcg. There should be somebody mapping the head. + */ + if (PageTransCompound(page)) + goto put; if (isolate_lru_page(page)) goto put; - if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, false, + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -5258,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * with mem_cgroup_cancel_charge() in case page instantiation fails. */ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) @@ -5291,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, } } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - if (!memcg) memcg = get_mem_cgroup_from_mm(mm); @@ -5324,9 +5309,9 @@ out: * Use mem_cgroup_cancel_charge() to cancel the transaction instead. */ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); @@ -5343,13 +5328,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, commit_charge(page, memcg, lrucare); - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); + mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); @@ -5371,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; if (mem_cgroup_disabled()) return; @@ -5385,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) if (!memcg) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - cancel_charge(memcg, nr_pages); } @@ -5750,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -1); + mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8424b64..ac595e7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); - if (PageHuge(head)) - return get_page_unless_zero(head); - - /* - * Thp tail page has special refcounting rule (refcount of tail pages - * is stored in ->_mapcount,) so we can't call get_page_unless_zero() - * directly for tail pages. - */ - if (PageTransHuge(head)) { + if (!PageHuge(head) && PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. @@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page) page_to_pfn(page)); return 0; } - - if (get_page_unless_zero(head)) { - if (PageTail(page)) - get_page(page); - return 1; - } else { - return 0; - } } - return get_page_unless_zero(page); + return get_page_unless_zero(head); } EXPORT_SYMBOL_GPL(get_hwpoison_page); -/** - * put_hwpoison_page() - Put refcount for memory error handling: - * @page: raw error page (hit by memory error) - */ -void put_hwpoison_page(struct page *page) -{ - struct page *head = compound_head(page); - - if (PageHuge(head)) { - put_page(head); - return; - } - - if (PageTransHuge(head)) - if (page != head) - put_page(head); - - put_page(page); -} -EXPORT_SYMBOL_GPL(put_hwpoison_page); - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { + lock_page(hpage); if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + unlock_page(hpage); if (!PageAnon(hpage)) pr_err("MCE: %#lx: non anonymous thp\n", pfn); else @@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(p); return -EBUSY; } + unlock_page(hpage); + get_hwpoison_page(p); + put_hwpoison_page(hpage); VM_BUG_ON_PAGE(!page_count(p), p); hpage = compound_head(p); } @@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others - * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageLocked() * - to avoid races with __SetPageSlab*() (and more non-atomic ops) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. @@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) * Did it turn free? */ ret = __get_any_page(page, pfn, 0); - if (!PageLRU(page)) { + if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ put_hwpoison_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", @@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags) return ret; } +static int soft_offline_in_use_page(struct page *page, int flags) +{ + int ret; + struct page *hpage = compound_head(page); + + if (!PageHuge(page) && PageTransHuge(hpage)) { + lock_page(hpage); + if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + unlock_page(hpage); + if (!PageAnon(hpage)) + pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); + else + pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); + put_hwpoison_page(hpage); + return -EBUSY; + } + unlock_page(hpage); + get_hwpoison_page(page); + put_hwpoison_page(hpage); + } + + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + + return ret; +} + +static void soft_offline_free_page(struct page *page) +{ + if (PageHuge(page)) { + struct page *hpage = compound_head(page); + + set_page_hwpoison_huge_page(hpage); + if (!dequeue_hwpoisoned_huge_page(hpage)) + num_poisoned_pages_add(1 << compound_order(hpage)); + } else { + if (!TestSetPageHWPoison(page)) + num_poisoned_pages_inc(); + } +} + /** * soft_offline_page - Soft offline a page. * @page: page to offline @@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); @@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); return -EBUSY; } - if (!PageHuge(page) && PageTransHuge(hpage)) { - if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { - pr_info("soft offline: %#lx: failed to split THP\n", - pfn); - if (flags & MF_COUNT_INCREASED) - put_hwpoison_page(page); - return -EBUSY; - } - } get_online_mems(); - ret = get_any_page(page, pfn, flags); put_online_mems(); - if (ret > 0) { /* for in-use pages */ - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - } else if (ret == 0) { /* for free pages */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); - } - } + + if (ret > 0) + ret = soft_offline_in_use_page(page, flags); + else if (ret == 0) + soft_offline_free_page(page); + return ret; } diff --git a/mm/memory.c b/mm/memory.c index d4e4d37..ff17850 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -50,6 +50,7 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> +#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> @@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); - int wait_split_huge_page; if (!new) return -ENOMEM; @@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ ptl = pmd_lock(mm, pmd); - wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; - } else if (unlikely(pmd_trans_splitting(*pmd))) - wait_split_huge_page = 1; + } spin_unlock(ptl); if (new) pte_free(mm, new); - if (wait_split_huge_page) - wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } else - VM_BUG_ON(pmd_trans_splitting(*pmd)); + } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -870,7 +865,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page); + page_dup_rmap(page, false); rss[mm_counter(page)]++; } @@ -955,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd)) { + if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); err = copy_huge_pmd(dst_mm, src_mm, @@ -1118,7 +1113,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page); + page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { @@ -1182,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { #ifdef CONFIG_DEBUG_VM if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { @@ -1193,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, BUG(); } #endif - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -1506,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1522,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, goto out_unlock; /* Ok, finally just insert the thing.. */ - entry = pte_mkspecial(pfn_pte(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); + else + entry = pte_mkspecial(pfn_t_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1569,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) + if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) return -EINVAL; - ret = insert_pfn(vma, addr, pfn, pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); return ret; } EXPORT_SYMBOL(vm_insert_pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) + pfn_t pfn) { BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); @@ -1593,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) { struct page *page; - page = pfn_to_page(pfn); + page = pfn_t_to_page(pfn); return insert_page(vma, addr, page, vma->vm_page_prot); } return insert_pfn(vma, addr, pfn, vma->vm_page_prot); @@ -2087,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, cow_user_page(new_page, old_page, address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -2118,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2151,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page); + page_remove_rmap(old_page, false); } /* Free the old page.. */ new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) @@ -2173,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, */ if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ - munlock_vma_page(old_page); + if (PageMlocked(old_page)) + munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); @@ -2533,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2567,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; - exclusive = 1; + exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) @@ -2575,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, pte); if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, address, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2613,7 +2612,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2707,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!page) goto oom; - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* @@ -2728,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(page_table, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return handle_userfault(vma, address, flags, VM_UFFD_MISSING); } inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, address, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2747,7 +2746,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); goto unlock; oom_free_page: @@ -2824,7 +2823,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); @@ -3000,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -3029,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); if (fault_page) { @@ -3044,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); page_cache_release(new_page); return ret; } @@ -3096,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = fault_page->mapping; + mapping = page_rmapping(fault_page); unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { /* @@ -3198,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* TODO: handle PTE-mapped THP */ + if (PageCompound(page)) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -3370,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, int ret; barrier(); - if (pmd_trans_huge(orig_pmd)) { + if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - /* - * If the pmd is splitting, return and retry the - * the fault. Alternative: wait until the split - * is done, and goto retry. - */ - if (pmd_trans_splitting(orig_pmd)) - return 0; - if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); @@ -3407,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) + if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) return 0; /* * A regular pmd is established and it can't morph into a huge pmd diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 92f9595..4af58a3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -17,6 +17,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/memory.h> +#include <linux/memremap.h> #include <linux/memory_hotplug.h> #include <linux/highmem.h> #include <linux/vmalloc.h> @@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap; + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); + if (altmap) { + /* + * Validate altmap is within bounds of the total request + */ + if (altmap->base_pfn != phys_start_pfn + || vmem_altmap_offset(altmap) > nr_pages) { + pr_warn_once("memory add fail, invalid altmap\n"); + return -EINVAL; + } + altmap->alloc = 0; + } + for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, zone, section_nr_to_pfn(i)); @@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms) +static int __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { unsigned long start_pfn; int scn_nr; @@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) start_pfn = section_nr_to_pfn(scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms); + sparse_remove_one_section(zone, ms, map_offset); return 0; } @@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; - int sections_to_remove; - resource_size_t start, size; - int ret = 0; + unsigned long map_offset = 0; + int sections_to_remove, ret = 0; + + /* In the ZONE_DEVICE case device driver owns the memory region */ + if (is_dev_zone(zone)) { + struct page *page = pfn_to_page(phys_start_pfn); + struct vmem_altmap *altmap; + + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + map_offset = vmem_altmap_offset(altmap); + } else { + resource_size_t start, size; + + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + + ret = release_mem_region_adjustable(&iomem_resource, start, + size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + } /* * We can only remove entire sections @@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - /* in the ZONE_DEVICE case device driver owns the memory region */ - if (!is_dev_zone(zone)) - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - ret = __remove_section(zone, __pfn_to_section(pfn)); + + ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); + map_offset = 0; if (ret) break; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8caff0..27d1354 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid; + int nid, ret; pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(vma, addr, pmd); - if (pmd_trans_unstable(pmd)) - return 0; + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, addr); + } else { + get_page(page); + spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return 0; + } + } else { + spin_unlock(ptl); + } + } +retry: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; + if (PageTail(page) && PageAnon(page)) { + get_page(page); + pte_unmap_unlock(pte, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + /* Failed to split -- skip. */ + if (ret) { + pte = pte_offset_map_lock(walk->mm, pmd, + addr, &ptl); + continue; + } + goto retry; + } if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(page, qp->pagelist, flags); @@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (vma_migratable(vma) && + vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) change_prot_numa(vma, start, endvma); return 1; } diff --git a/mm/migrate.c b/mm/migrate.c index 7890d0b..b1034f9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, addr); else - page_dup_rmap(new); + page_dup_rmap(new, true); } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr); + page_add_anon_rmap(new, vma, addr, false); else page_add_file_rmap(new); @@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) + if (unlikely(PageTransHuge(page))) { + lock_page(page); + rc = split_huge_page(page); + unlock_page(page); + if (rc) goto out; + } rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) @@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; + prep_transhuge_page(new_page); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { @@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ @@ -1815,7 +1820,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1826,14 +1831,14 @@ fail_putback: flush_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page); + page_remove_rmap(new_page, true); goto fail_putback; } mlock_migrate_page(new_page, page); set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/mincore.c b/mm/mincore.c index 14bb9fb..2a565ed 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { memset(vec, 1, nr); spin_unlock(ptl); goto out; @@ -24,13 +24,13 @@ #include "internal.h" -int can_do_mlock(void) +bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) - return 1; + return true; if (capable(CAP_IPC_LOCK)) - return 1; - return 0; + return true; + return false; } EXPORT_SYMBOL(can_do_mlock); @@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page) /* Serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + if (!TestSetPageMlocked(page)) { mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); @@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page) /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + /* * Serialize with any parallel __split_huge_page_refcount() which * might otherwise copy PageMlocked to part of the tail pages before @@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, if (!page || page_zone_id(page) != zoneid) break; + /* + * Do not use pagevec for PTE-mapped THP, + * munlock_vma_pages_range() will handle them. + */ + if (PageTransCompound(page)) + break; + get_page(page); /* * Increase the address that will be returned *before* the @@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, &page_mask); if (page && !IS_ERR(page)) { - if (PageTransHuge(page)) { + if (PageTransTail(page)) { + VM_BUG_ON_PAGE(PageMlocked(page), page); + put_page(page); /* follow_page_mask() */ + } else if (PageTransHuge(page)) { lock_page(page); /* * Any THP page found by follow_page_mask() may @@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, goto next; } } - /* It's a bug to munlock in the middle of a THP page */ - VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); page_increm = 1 + page_mask; start += page_increm * PAGE_SIZE; next: @@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * - * We can take all the locks in random order because the VM code - * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never - * takes more than one of them in a row. Secondly we're protected - * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * We take locks in following order, accordingly to comment at beginning + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * + * We can take all locks within these types randomly because the VM code + * doesn't nest them and we protected from parallel mm_take_all_locks() by + * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. @@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping) + if (vma->vm_file && vma->vm_file->f_mapping && + is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } diff --git a/mm/mprotect.c b/mm/mprotect.c index c764402..8eb7bb4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + && pmd_none_or_clear_bad(pmd)) continue; /* invoke the mmu notifier if the pmd is populated */ @@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); diff --git a/mm/mremap.c b/mm/mremap.c index e55b157..d77946a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (!new_pmd) break; if (pmd_trans_huge(*old_pmd)) { - int err = 0; if (extent == HPAGE_PMD_SIZE) { + bool moved; VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, vma); /* See comment in move_ptes() */ if (need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - err = move_huge_pmd(vma, new_vma, old_addr, + moved = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); + if (moved) { + need_flush = true; + continue; + } } - if (err > 0) { - need_flush = true; - continue; - } else if (!err) { - split_huge_page_pmd(vma, old_addr, old_pmd); - } + split_huge_pmd(vma, old_pmd, old_addr); VM_BUG_ON(pmd_trans_huge(*old_pmd)); } if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce63d60..63358d9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include <linux/vmalloc.h> #include <linux/vmstat.h> #include <linux/mempolicy.h> +#include <linux/memremap.h> #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> @@ -222,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; -static void free_compound_page(struct page *page); compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, #ifdef CONFIG_HUGETLB_PAGE free_huge_page, #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + free_transhuge_page, +#endif }; int min_free_kbytes = 1024; @@ -450,7 +453,7 @@ out: * This usage means that zero-order pages may not be compound. */ -static void free_compound_page(struct page *page) +void free_compound_page(struct page *page) { __free_pages_ok(page, compound_order(page)); } @@ -466,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); + p->mapping = TAIL_MAPPING; set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -732,7 +737,7 @@ static inline int free_pages_check(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -856,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) ret = 0; goto out; } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); goto out; @@ -866,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } ret = 0; out: + page->mapping = NULL; clear_compound_head(page); return ret; } @@ -1329,7 +1356,7 @@ static inline int check_new_page(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -4459,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - pg_data_t *pgdat = NODE_DATA(nid); + struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); unsigned long end_pfn = start_pfn + size; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; - struct zone *z; unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &pgdat->node_zones[zone]; + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory + */ + if (altmap && start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s diff --git a/mm/page_idle.c b/mm/page_idle.c index d5dd790..4ea9c4e 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page, unsigned long addr, void *arg) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; pmd_t *pmd; pte_t *pte; + spinlock_t *ptl; bool referenced = false; - if (unlikely(PageTransHuge(page))) { - pmd = page_check_address_pmd(page, mm, addr, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (pmd) { - referenced = pmdp_clear_young_notify(vma, addr, pmd); - spin_unlock(ptl); - } + if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl)) + return SWAP_AGAIN; + + if (pte) { + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap(pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + referenced = pmdp_clear_young_notify(vma, addr, pmd); } else { - pte = page_check_address(page, mm, addr, &ptl, 0); - if (pte) { - referenced = ptep_clear_young_notify(vma, addr, pte); - pte_unmap_unlock(pte, ptl); - } + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } + + spin_unlock(ptl); + if (referenced) { clear_page_idle(page); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e139fe..92c4c36 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -196,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); - BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + + BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); + BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 29f2f8b..2072444 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ again: if (!walk->pte_entry) continue; - split_huge_page_pmd_mm(walk->mm, addr, pmd); + split_huge_pmd(walk->vma, pmd, addr); if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4c681ba..9d47676 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp)); + VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif -#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH -void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd = pmd_mksplitting(*pmdp); - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); - /* tlb flush only to serialize against gup-fast */ - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); -} -#endif - #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -23,21 +23,22 @@ * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_rwsem - * anon_vma->rwsem - * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed, isolate_lru_page) - * swap_lock (in swap_duplicate, swap_info_get) - * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) - * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) - * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, - * in arch-dependent flush_dcache_mmap_lock, - * within bdi.wb->list_lock in __sync_single_inode) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * mapping->i_mmap_rwsem + * anon_vma->rwsem + * mm->page_table_lock or pte_lock + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * mapping->tree_lock (widely used) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * mapping->tree_lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock @@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) anon_vma_unlock_read(anon_vma); } -/* - * At what user virtual address is page expected in @vma? - */ -static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) -{ - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); -} - -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - unsigned long address = __vma_address(page, vma); - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - - return address; -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH static void percpu_flush_tlb_batch_pages(void *data) { @@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * Check that @page is mapped at @address into @mm. In contrast to + * page_check_address(), this function can handle transparent huge pages. + * + * On success returns true with pte mapped and locked. For PMD-mapped + * transparent huge pages *@ptep is set to NULL. + */ +bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, + unsigned long address, pmd_t **pmdp, + pte_t **ptep, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ + pte = huge_pte_offset(mm, address); + if (!pte) + return false; + + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); + pmd = NULL; + goto check_pte; + } + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return false; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return false; + pmd = pmd_offset(pud, address); + + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock_pmd; + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + goto map_pte; + } + + if (pmd_page(*pmd) != page) + goto unlock_pmd; + + pte = NULL; + goto found; +unlock_pmd: + spin_unlock(ptl); + return false; + } else { + pmd_t pmde = *pmd; + + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + return false; + } +map_pte: + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + pte_unmap(pte); + return false; + } + + ptl = pte_lockptr(mm, pmd); +check_pte: + spin_lock(ptl); + + if (!pte_present(*pte)) { + pte_unmap_unlock(pte, ptl); + return false; + } + + /* THP can be referenced by any subpage */ + if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { + pte_unmap_unlock(pte, ptl); + return false; + } +found: + *ptep = pte; + *pmdp = pmd; + *ptlp = ptl; + return true; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + struct page_referenced_arg { int mapcount; int referenced; @@ -832,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; + struct page_referenced_arg *pra = arg; + pmd_t *pmd; + pte_t *pte; spinlock_t *ptl; int referenced = 0; - struct page_referenced_arg *pra = arg; - - if (unlikely(PageTransHuge(page))) { - pmd_t *pmd; - /* - * rmap might return false positives; we must filter - * these out using page_check_address_pmd(). - */ - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (!pmd) - return SWAP_AGAIN; - - if (vma->vm_flags & VM_LOCKED) { - spin_unlock(ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl)) + return SWAP_AGAIN; - /* go ahead even if the pmd is pmd_trans_splitting() */ - if (pmdp_clear_flush_young_notify(vma, address, pmd)) - referenced++; + if (vma->vm_flags & VM_LOCKED) { + if (pte) + pte_unmap(pte); spin_unlock(ptl); - } else { - pte_t *pte; - - /* - * rmap might return false positives; we must filter - * these out using page_check_address(). - */ - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - return SWAP_AGAIN; - - if (vma->vm_flags & VM_LOCKED) { - pte_unmap_unlock(pte, ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ + } + if (pte) { if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -886,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } - pte_unmap_unlock(pte, ptl); + pte_unmap(pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + } else { + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } + spin_unlock(ptl); if (referenced) clear_page_idle(page); @@ -935,7 +987,7 @@ int page_referenced(struct page *page, int ret; int we_locked = 0; struct page_referenced_arg pra = { - .mapcount = page_mapcount(page), + .mapcount = total_mapcount(page), .memcg = memcg, }; struct rmap_walk_control rwc = { @@ -1124,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page, * over the call to page_add_new_anon_rmap. */ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); - BUG_ON(page->index != linear_page_index(vma, address)); + BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); #endif } @@ -1133,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, @@ -1140,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { - do_page_add_anon_rmap(page, vma, address, 0); + do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); } /* @@ -1151,29 +1204,44 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) + struct vm_area_struct *vma, unsigned long address, int flags) { - int first = atomic_inc_and_test(&page->_mapcount); + bool compound = flags & RMAP_COMPOUND; + bool first; + + if (compound) { + atomic_t *mapcount; + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + mapcount = compound_mapcount_ptr(page); + first = atomic_inc_and_test(mapcount); + } else { + first = atomic_inc_and_test(&page->_mapcount); + } + if (first) { + int nr = compound ? hpage_nr_pages(page) : 1; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (PageTransHuge(page)) + if (compound) { __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); } if (unlikely(PageKsm(page))) return; VM_BUG_ON_PAGE(!PageLocked(page), page); + /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, exclusive); + __page_set_anon_rmap(page, vma, address, + flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); } @@ -1183,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. */ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); - atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* increment count (starts at -1) */ + atomic_set(compound_mapcount_ptr(page), 0); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } else { + /* Anon THP always mapped first with PMD */ + VM_BUG_ON_PAGE(PageTransCompound(page), page); + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1225,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page) memcg = mem_cgroup_begin_page_stat(page); - /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) { + /* hugetlb pages are always mapped with pmds */ + atomic_dec(compound_mapcount_ptr(page)); goto out; + } - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ - if (unlikely(PageHuge(page))) + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) goto out; /* @@ -1247,41 +1328,79 @@ out: mem_cgroup_end_page_stat(memcg); } +static void page_remove_anon_compound_rmap(struct page *page) +{ + int i, nr; + + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + if (TestClearPageDoubleMap(page)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * themi are still mapped. + */ + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + } else { + nr = HPAGE_PMD_NR; + } + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); + + if (nr) { + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); + deferred_split_huge_page(page); + } +} + /** * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from + * @page: page to remove mapping from + * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page) +void page_remove_rmap(struct page *page, bool compound) { if (!PageAnon(page)) { + VM_BUG_ON_PAGE(compound && !PageHuge(page), page); page_remove_file_rmap(page); return; } + if (compound) + return page_remove_anon_compound_rmap(page); + /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) return; - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (PageTransHuge(page)) - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); + __dec_zone_page_state(page, NR_ANON_PAGES); if (unlikely(PageMlocked(page))) clear_page_mlock(page); + if (PageTransCompound(page)) + deferred_split_huge_page(compound_head(page)); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1293,6 +1412,11 @@ void page_remove_rmap(struct page *page) */ } +struct rmap_private { + enum ttu_flags flags; + int lazyfreed; +}; + /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1304,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - enum ttu_flags flags = (enum ttu_flags)arg; + struct rmap_private *rp = arg; + enum ttu_flags flags = rp->flags; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1396,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See handle_pte_fault() ... */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); + + if (!PageDirty(page) && (flags & TTU_LZFREE)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + rp->lazyfreed++; + goto discard; + } + if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pte, pteval); ret = SWAP_FAIL; @@ -1416,7 +1549,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } else dec_mm_counter(mm, mm_counter_file(page)); - page_remove_rmap(page); +discard: + page_remove_rmap(page, PageHuge(page)); page_cache_release(page); out_unmap: @@ -1468,9 +1602,14 @@ static int page_not_mapped(struct page *page) int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_private rp = { + .flags = flags, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)flags, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1490,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) + if (ret != SWAP_MLOCK && !page_mapped(page)) { ret = SWAP_SUCCESS; + if (rp.lazyfreed && !PageDirty(page)) + ret = SWAP_LZFREE; + } return ret; } @@ -1513,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int try_to_munlock(struct page *page) { int ret; + struct rmap_private rp = { + .flags = TTU_MUNLOCK, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)TTU_MUNLOCK, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, @@ -1698,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(&page->_mapcount); + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); } @@ -1707,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); - atomic_set(&page->_mapcount, 0); + atomic_set(compound_mapcount_ptr(page), 0); __hugepage_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ @@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, + false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) if (error) { if (error != -ENOMEM) error = 0; - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); } else - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); out: unlock_page(page); page_cache_release(page); @@ -1085,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __set_page_locked(newpage); + __SetPageLocked(newpage); SetPageUptodate(newpage); SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); @@ -1218,7 +1219,8 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, swp_to_radix_entry(swap)); @@ -1235,14 +1237,14 @@ repeat: * "repeat": reading a hole and writing should succeed. */ if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); delete_from_swap_cache(page); } } if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); spin_lock(&info->lock); info->swapped--; @@ -1277,11 +1279,12 @@ repeat: } __SetPageSwapBacked(page); - __set_page_locked(page); + __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); @@ -1291,10 +1294,10 @@ repeat: radix_tree_preload_end(); } if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); goto decused; } - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_anon(page); spin_lock(&info->lock); @@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x) */ static __always_inline void slab_lock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } static __always_inline void slab_unlock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2..b60802b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/bootmem.h> +#include <linux/memremap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) } /* need to make sure size is all the same during early stage */ -void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +static void * __meminit alloc_block_buf(unsigned long size, int node) { void *ptr; @@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) return ptr; } +static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) +{ + return altmap->base_pfn + altmap->reserve + altmap->alloc + + altmap->align; +} + +static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap) +{ + unsigned long allocated = altmap->alloc + altmap->align; + + if (altmap->free > allocated) + return altmap->free - allocated; + return 0; +} + +/** + * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation + * @altmap - reserved page pool for the allocation + * @nr_pfns - size (in pages) of the allocation + * + * Allocations are aligned to the size of the request + */ +static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + unsigned long pfn = vmem_altmap_next_pfn(altmap); + unsigned long nr_align; + + nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); + nr_align = ALIGN(pfn, nr_align) - pfn; + + if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) + return ULONG_MAX; + altmap->alloc += nr_pfns; + altmap->align += nr_align; + return pfn + nr_align; +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap) +{ + unsigned long pfn, nr_pfns; + void *ptr; + + if (size & ~PAGE_MASK) { + pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", + __func__, size); + return NULL; + } + + nr_pfns = size >> PAGE_SHIFT; + pfn = vmem_altmap_alloc(altmap, nr_pfns); + if (pfn < ULONG_MAX) + ptr = __va(__pfn_to_phys(pfn)); + else + ptr = NULL; + pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", + __func__, pfn, altmap->alloc, altmap->align, nr_pfns); + + return ptr; +} + +/* need to make sure size is all the same during early stage */ +void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap) +{ + if (altmap) + return altmap_alloc_block_buf(size, altmap); + return alloc_block_buf(size, node); +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); + void *p = alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b6..3717cee 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; - for (i = 0; i < PAGES_PER_SECTION; i++) { + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); @@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; @@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) } pgdat_resize_unlock(pgdat, &flags); - clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); + clear_hwpoisoned_pages(memmap + map_offset, + PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap); } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> +#include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> @@ -45,6 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -/** - * Two special cases here: we could avoid taking compound_lock_irqsave - * and could skip the tail refcounting(in _mapcount). - * - * 1. Hugetlbfs page: - * - * PageHeadHuge will remain true until the compound page - * is released and enters the buddy allocator, and it could - * not be split by __split_huge_page_refcount(). - * - * So if we see PageHeadHuge set, and we have the tail page pin, - * then we could safely put head page. - * - * 2. Slab THP page: - * - * PG_slab is cleared before the slab frees the head page, and - * tail pin cannot be the last reference left on the head page, - * because the slab code is free to reuse the compound page - * after a kfree/kmem_cache_free without having to check if - * there's any tail pin left. In turn all tail pinsmust be always - * released while the head is still pinned by the slab code - * and so we know PG_slab will be still set too. - * - * So if we see PageSlab set, and we have the tail page pin, - * then we could safely put head page. - */ -static __always_inline -void put_unrefcounted_compound_page(struct page *page_head, struct page *page) -{ - /* - * If @page is a THP tail, we must read the tail page - * flags after the head page flags. The - * __split_huge_page_refcount side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here, see the comment above this function. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab THP page, - * the tail pin must not be the last reference - * held on the page, because the PG_slab cannot - * be cleared before all tail pins (which skips - * the _mapcount tail refcounting) have been - * released. - * - * If this is the tail of a hugetlbfs page, - * the tail pin may be the last reference on - * the page instead, because PageHeadHuge will - * not go away until the compound page enters - * the buddy allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - } else - /* - * __split_huge_page_refcount run before us, - * @page was a THP tail. The split @page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - if (put_page_testzero(page)) - __put_single_page(page); -} - -static __always_inline -void put_refcounted_compound_page(struct page *page_head, struct page *page) -{ - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * @page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The @page_head may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != compound_head(page), page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* @page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } -} - -static void put_compound_page(struct page *page) -{ - struct page *page_head; - - /* - * We see the PageCompound set and PageTail not set, so @page maybe: - * 1. hugetlbfs head page, or - * 2. THP head page. - */ - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* - * We see the PageCompound set and PageTail set, so @page maybe: - * 1. a tail hugetlbfs page, or - * 2. a tail THP page, or - * 3. a split THP page. - * - * Case 3 is possible, as we may race with - * __split_huge_page_refcount tearing down a THP page. - */ - page_head = compound_head(page); - if (!__compound_tail_refcounted(page_head)) - put_unrefcounted_compound_page(page_head, page); - else - put_refcounted_compound_page(page_head, page); -} - -void put_page(struct page *page) +void __put_page(struct page *page) { if (unlikely(PageCompound(page))) - put_compound_page(page); - else if (put_page_testzero(page)) + __put_compound_page(page); + else __put_single_page(page); } -EXPORT_SYMBOL(put_page); - -/* - * This function is exported but must not be called by anything other - * than get_page(). It implements the slow path of get_page(). - */ -bool __get_page_tail(struct page *page) -{ - /* - * This takes care of get_page() if run on a tail page - * returned by one of the get_user_pages/follow_page variants. - * get_user_pages/follow_page itself doesn't need the compound - * lock because it runs __get_page_tail_foll() under the - * proper PT lock that already serializes against - * split_huge_page(). - */ - unsigned long flags; - bool got; - struct page *page_head = compound_head(page); - - /* Ref to put_compound_page() comment. */ - if (!__compound_tail_refcounted(page_head)) { - smp_rmb(); - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - __get_page_tail_foll(page, true); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - return false; - } - } - - got = false; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); - } - return got; -} -EXPORT_SYMBOL(__get_page_tail); +EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages @@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { + page = compound_head(page); if (!PageActive(page) && !PageUnevictable(page) && PageReferenced(page)) { @@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } + +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(lruvec, file, 0); + } +} + /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + activate_page_drain(cpu); } @@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page) } } +/** + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + void lru_add_drain(void) { lru_add_drain_cpu(get_cpu()); @@ -883,6 +682,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); @@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold) for (i = 0; i < nr; i++) { struct page *page = pages[i]; - if (unlikely(PageCompound(page))) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - put_compound_page(page); - continue; - } - /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + page = compound_head(page); if (!put_page_testzero(page)) continue; + if (PageCompound(page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __put_compound_page(page); + continue; + } + if (PageLRU(page)) { struct zone *pagezone = page_zone(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index d504adb..676ff29 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list) * deadlock in the swap out path. */ /* - * Add it to the swap cache and mark it dirty + * Add it to the swap cache. */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - if (!err) { /* Success */ - SetPageDirty(page); + if (!err) { return 1; } else { /* -ENOMEM radix-tree allocation failure */ /* @@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/mm/swapfile.c b/mm/swapfile.c index e6b8591..2bb30aa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -926,6 +926,9 @@ int reuse_swap_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return 0; + /* The page is part of THP and cannot be reused */ + if (PageTransCompound(page)) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -1108,19 +1111,9 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ -static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { -#ifdef CONFIG_MEM_SOFT_DIRTY - /* - * When pte keeps soft dirty bit the pte generated - * from swap entry does not has it, still it's same - * pte from logical point of view. - */ - pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); - return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); -#else - return pte_same(pte, swp_pte); -#endif + return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); } /* @@ -1142,14 +1135,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg); + if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1160,11 +1154,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge(page, memcg, true); + page_add_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1206,7 +1200,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(maybe_same_pte(*pte, swp_pte))) { + if (unlikely(pte_same_as_swp(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 77fee93..806b0c7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); @@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -91,7 +91,7 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); out_release: page_cache_release(page); goto out; @@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page) struct address_space *page_mapping(struct page *page) { - unsigned long mapping; + struct address_space *mapping; + + page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -399,11 +401,25 @@ struct address_space *page_mapping(struct page *page) return swap_address_space(entry); } - mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + mapping = page->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return page->mapping; + return mapping; +} + +/* Slow path of page_mapcount() for compound pages */ +int __page_mapcount(struct page *page) +{ + int ret; + + ret = atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + return ret; } +EXPORT_SYMBOL_GPL(__page_mapcount); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 58ceeb1..fb42a5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -455,7 +455,7 @@ found: free_vmap_cache = &va->rb_node; spin_unlock(&vmap_area_lock); - BUG_ON(va->va_start & (align-1)); + BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); @@ -1086,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(addr & (PAGE_SIZE-1)); + BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); diff --git a/mm/vmscan.c b/mm/vmscan.c index 108bd11..5ac8695 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool lazyfree = false; + int ret = SWAP_SUCCESS; cond_resched(); @@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; + lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, - ttu_flags|TTU_BATCH_FLUSH)) { + switch (ret = try_to_unmap(page, lazyfree ? + (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : + (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; + case SWAP_LZFREE: + goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } +lazyfree: if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; @@ -1184,8 +1191,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, * we obviously don't have to worry about waking up a process * waiting on the page lock, because there are no references. */ - __clear_page_locked(page); + __ClearPageLocked(page); free_it: + if (ret == SWAP_LZFREE) + count_vm_event(PGLAZYFREED); + nr_reclaimed++; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 83a003b..64bd0aa 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -783,6 +783,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd") @@ -844,7 +845,9 @@ const char * const vmstat_text[] = { "thp_fault_fallback", "thp_collapse_alloc", "thp_collapse_alloc_failed", - "thp_split", + "thp_split_page", + "thp_split_page_failed", + "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif |