diff options
Diffstat (limited to 'mm')
63 files changed, 4100 insertions, 2685 deletions
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT bool config DEFERRED_STRUCT_PAGE_INIT - bool "Defer initialisation of struct pages to kswapd" + bool "Defer initialisation of struct pages to kthreads" default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG @@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up a subset of memmap at boot and then initialise the rest in parallel - when kswapd starts. This has a potential performance impact on - processes running early in the lifetime of the systemm until kswapd - finishes the initialisation. + by starting one-off "pgdatinitX" kernel thread for each node X. This + has a potential performance impact on processes running early in the + lifetime of the system until these kthreads finish the + initialisation. config IDLE_PAGE_TRACKING bool "Enable idle page tracking" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7340353..926c76d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { - bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; @@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * here rather than calling cond_resched(). */ if (current->flags & PF_WQ_WORKER) - schedule_timeout(1); + schedule_timeout_uninterruptible(1); else cond_resched(); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index d3116be..300117f 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -61,6 +61,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) bool dequeued_page; dequeued_page = false; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { /* * Block others from accessing the 'page' while we get around @@ -75,15 +76,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) continue; } #endif - spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_delete(page); __count_vm_event(BALLOON_DEFLATE); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); dequeued_page = true; break; } } + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); if (!dequeued_page) { /* diff --git a/mm/bootmem.c b/mm/bootmem.c index 3b63807..91e32bc 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; diff --git a/mm/cleancache.c b/mm/cleancache.c index 8fc5081..ba5d8f3 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -22,7 +22,7 @@ * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops *cleancache_ops __read_mostly; +static const struct cleancache_ops *cleancache_ops __read_mostly; /* * Counters available via /sys/kernel/debug/cleancache (if debugfs is @@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused) /* * Register operations for cleancache. Returns 0 on success. */ -int cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(const struct cleancache_ops *ops) { if (cmpxchg(&cleancache_ops, NULL, ops)) return -EBUSY; diff --git a/mm/compaction.c b/mm/compaction.c index de3e1e7..585de54 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1658,14 +1658,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); - if (cc->order > 0) { - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); - } - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); + + if (is_via_compact_memory(cc->order)) + continue; + + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); } } @@ -1708,7 +1709,10 @@ static void compact_nodes(void) /* The written value is actually unused, all memory is compacted */ int sysctl_compact_memory; -/* This is the entry point for compacting all nodes via /proc/sys/vm */ +/* + * This is the entry point for compacting all nodes via + * /proc/sys/vm/compact_memory + */ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_MEMORY_FAILURE {1UL << PG_hwpoison, "hwpoison" }, #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) {1UL << PG_young, "young" }, {1UL << PG_idle, "idle" }, @@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags, void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags) { - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); + if (PageCompound(page)) + pr_cont(" compound_mapcount: %d", compound_mapcount(page)); + pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); if (reason) @@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm) mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, diff --git a/mm/filemap.c b/mm/filemap.c index 1bb00762..bc94386 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -11,6 +11,7 @@ */ #include <linux/export.h> #include <linux/compiler.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/capability.h> @@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); if (shadow) { - mapping->nrshadows++; + mapping->nrexceptional++; /* - * Make sure the nrshadows update is committed before + * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing * with reclaim does not see both counters 0 at the * same time and miss a shadow entry. @@ -204,7 +205,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); - BUG_ON(page_mapped(page)); + VM_BUG_ON_PAGE(page_mapped(page), page); /* * At this point page must be either written or cleaned by truncate. @@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; + if (dax_mapping(mapping) && mapping->nrexceptional) { + err = dax_writeback_mapping_range(mapping, lstart, lend); + if (err) + return err; + } + if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; + + if (WARN_ON(dax_mapping(mapping))) + return -EINVAL; + if (shadowp) *shadowp = p; - mapping->nrshadows--; + mapping->nrexceptional--; if (node) workingset_node_shadows_dec(node); } @@ -618,7 +629,7 @@ static int __add_to_page_cache_locked(struct page *page, if (!huge) { error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg); + gfp_mask, &memcg, false); if (error) return error; } @@ -626,7 +637,7 @@ static int __add_to_page_cache_locked(struct page *page, error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); return error; } @@ -645,7 +656,7 @@ static int __add_to_page_cache_locked(struct page *page, __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: @@ -653,7 +664,7 @@ err_insert: /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return error; } @@ -682,11 +693,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret; - __set_page_locked(page); + __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow); if (unlikely(ret)) - __clear_page_locked(page); + __ClearPageLocked(page); else { /* * The page might have been evicted from cache only @@ -809,6 +820,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { + page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_atomic(); @@ -873,18 +885,20 @@ EXPORT_SYMBOL_GPL(page_endio); */ void __lock_page(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, + __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); int __lock_page_killable(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - return __wait_on_bit_lock(page_waitqueue(page), &wait, + return __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -1242,9 +1256,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. */ goto export; } @@ -1491,6 +1505,74 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); +/** + * find_get_entries_tag - find and return entries that match @tag + * @mapping: the address_space to search + * @start: the starting page cache index + * @tag: the tag index + * @nr_entries: the maximum number of entries + * @entries: where the resulting entries are placed + * @indices: the cache indices corresponding to the entries in @entries + * + * Like find_get_entries, except we only return entries which are tagged with + * @tag. + */ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, start, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_entries_tag); + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -1812,19 +1894,18 @@ EXPORT_SYMBOL(generic_file_read_iter); * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int page_cache_read(struct file *file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { struct address_space *mapping = file->f_mapping; struct page *page; int ret; do { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp_mask|__GFP_COLD); if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, - mapping_gfp_constraint(mapping, GFP_KERNEL)); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2005,7 +2086,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, offset); + error = page_cache_read(file, offset, vmf->gfp_mask); /* * The page we want has now been added to the page cache. @@ -2682,11 +2763,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; ssize_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -4,6 +4,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -116,8 +128,28 @@ retry: } } - if (flags & FOLL_GET) - get_page_foll(page); + if (flags & FOLL_SPLIT && PageTransCompound(page)) { + int ret; + get_page(page); + pte_unmap_unlock(ptep, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return ERR_PTR(ret); + goto retry; + } + + if (flags & FOLL_GET) { + get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -130,6 +162,10 @@ retry: mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* Do not mlock pte-mapped THP */ + if (PageTransCompound(page)) + goto out; + /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - return follow_page_pte(vma, address, pmd, flags); - } + if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - return page; - } - } else + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (likely(!pmd_trans_huge(*pmd))) + return follow_page_pte(vma, address, pmd, flags); + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + return follow_page_pte(vma, address, pmd, flags); + } + if (flags & FOLL_SPLIT) { + int ret; + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + ret = 0; + split_huge_pmd(vma, pmd, address); + } else { + get_page(page); spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + } + + return ret ? ERR_PTR(ret) : + follow_page_pte(vma, address, pmd, flags); } - return follow_page_pte(vma, address, pmd, flags); + + page = follow_trans_huge_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -376,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * Anon pages in shared mappings are surprising: now * just reject it. */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + if (!is_cow_mapping(vm_flags)) return -EFAULT; - } } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) @@ -564,6 +616,8 @@ EXPORT_SYMBOL(__get_user_pages); * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() + * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller + * does not allow retry * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -575,22 +629,28 @@ EXPORT_SYMBOL(__get_user_pages); * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. + * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). + * This function will not return with an unlocked mmap_sem. So it has not the + * same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) + unsigned long address, unsigned int fault_flags, + bool *unlocked) { struct vm_area_struct *vma; vm_flags_t vm_flags; - int ret; + int ret, major = 0; + if (unlocked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + +retry: vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; @@ -600,6 +660,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); + major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -609,8 +670,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; BUG(); } + + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + if (!(fault_flags & FAULT_FLAG_TRIED)) { + *unlocked = true; + fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + if (tsk) { - if (ret & VM_FAULT_MAJOR) + if (major) tsk->maj_flt++; else tsk->min_flt++; @@ -896,7 +968,6 @@ long populate_vma_page_range(struct vm_area_struct *vma, gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) gup_flags &= ~FOLL_POPULATE; - /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1036,9 +1107,6 @@ struct page *get_dump_page(unsigned long addr) * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free * pages containing page tables. * - * *) THP splits will broadcast an IPI, this can be achieved by overriding - * pmdp_splitting_flush. - * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. @@ -1066,7 +1134,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * for an example see gup_get_pte in arch/x86/mm/gup.c */ pte_t pte = READ_ONCE(*ptep); - struct page *page; + struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow @@ -1078,15 +1146,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); + head = compound_head(page); - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); + put_page(head); goto pte_unmap; } + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; @@ -1119,7 +1189,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pmd_write(orig)) @@ -1128,7 +1198,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1149,24 +1218,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - /* - * Any tail pages need their mapcount reference taken before we - * return. (This allows the THP code to bump their ref count when - * they are split into base pages). - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pud_write(orig)) @@ -1175,7 +1233,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1196,12 +1253,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1210,7 +1261,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page, *tail; + struct page *head, *page; if (write && !pgd_write(orig)) return 0; @@ -1218,7 +1269,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1239,12 +1289,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1259,7 +1303,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 62fe06b..08fc0ba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,12 +16,16 @@ #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> +#include <linux/swapops.h> #include <linux/dax.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> +#include <linux/pfn_t.h> #include <linux/mman.h> +#include <linux/memremap.h> #include <linux/pagemap.h> +#include <linux/debugfs.h> #include <linux/migrate.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> @@ -31,6 +35,34 @@ #include <asm/pgalloc.h> #include "internal.h" +enum scan_result { + SCAN_FAIL, + SCAN_SUCCEED, + SCAN_PMD_NULL, + SCAN_EXCEED_NONE_PTE, + SCAN_PTE_NON_PRESENT, + SCAN_PAGE_RO, + SCAN_NO_REFERENCED_PAGE, + SCAN_PAGE_NULL, + SCAN_SCAN_ABORT, + SCAN_PAGE_COUNT, + SCAN_PAGE_LRU, + SCAN_PAGE_LOCK, + SCAN_PAGE_ANON, + SCAN_PAGE_COMPOUND, + SCAN_ANY_PROCESS, + SCAN_VMA_NULL, + SCAN_VMA_CHECK, + SCAN_ADDRESS_RANGE, + SCAN_SWAP_CACHE_PAGE, + SCAN_DEL_PAGE_LRU, + SCAN_ALLOC_HUGE_PAGE_FAIL, + SCAN_CGROUP_CHARGE_FAIL +}; + +#define CREATE_TRACE_POINTS +#include <trace/events/huge_memory.h> + /* * By default transparent hugepage support is disabled in order that avoid * to risk increase the memory footprint of applications without a guaranteed @@ -106,6 +138,7 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) { @@ -638,6 +671,9 @@ static int __init hugepage_init(void) err = register_shrinker(&huge_zero_page_shrinker); if (err) goto err_hzp_shrinker; + err = register_shrinker(&deferred_split_shrinker); + if (err) + goto err_split_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -655,6 +691,8 @@ static int __init hugepage_init(void) return 0; err_khugepaged: + unregister_shrinker(&deferred_split_shrinker); +err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: khugepaged_slab_exit(); @@ -711,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) return entry; } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * ->lru in the tail pages is occupied by compound_head. + * Let's use ->mapping + ->index in the second tail page as list_head. + */ + return (struct list_head *)&page[2].mapping; +} + +void prep_transhuge_page(struct page *page) +{ + /* + * we use page->mapping and page->indexlru in second tail page + * as list_head: assuming THP order >= 2 + */ + BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + + INIT_LIST_HEAD(page_deferred_list(page)); + set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -724,7 +783,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -732,7 +791,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return VM_FAULT_OOM; } @@ -748,7 +807,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); } else { @@ -759,7 +818,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, int ret; spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); ret = handle_userfault(vma, address, flags, @@ -770,8 +829,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, haddr, true); + mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -799,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - pgtable_trans_huge_deposit(mm, pmd, pgtable); + if (pgtable) + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); return true; @@ -865,32 +925,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } + prep_transhuge_page(page); return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, flags); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) { - entry = pmd_mkhuge(pfn_pmd(pfn, prot)); - if (write) { - entry = pmd_mkyoung(pmd_mkdirty(entry)); - entry = maybe_pmd_mkwrite(entry, vma); - } - set_pmd_at(mm, addr, pmd, entry); - update_mmu_cache_pmd(vma, addr, pmd); - } + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pmd_mkdevmap(entry); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); spin_unlock(ptl); } int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, bool write) + pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; /* @@ -902,7 +963,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; @@ -912,6 +973,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + pmd_t _pmd; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pmd is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pmd to + * set the young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -919,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; - pgtable_t pgtable; + pgtable_t pgtable = NULL; int ret; - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; + if (!vma_is_dax(vma)) { + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + } dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -933,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(!pmd_trans_huge(pmd))) { + if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } @@ -956,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (unlikely(pmd_trans_splitting(pmd))) { - /* split huge page running from under us */ - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - pte_free(dst_mm, pgtable); - - wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ - goto out; + if (!vma_is_dax(vma)) { + /* thp accounting separate from pmd_devmap accounting */ + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&dst_mm->nr_ptes); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); } - src_page = pmd_page(pmd); - VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - get_page(src_page); - page_dup_rmap(src_page); - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: @@ -1008,37 +1122,6 @@ unlock: spin_unlock(ptl); } -/* - * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages - * during copy_user_huge_page()'s copy_page_rep(): in the case when - * the source page gets split and a tail freed before copy completes. - * Called under pmd_lock of checked pmd, so safe from splitting itself. - */ -static void get_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - atomic_add(HPAGE_PMD_NR, &page->_count); - while (++page < endpage) - get_huge_page_tail(page); - } else { - get_page(page); - } -} - -static void put_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - while (page < endpage) - put_page(page++); - } else { - put_page(page); - } -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1068,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg))) { + &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, + false); put_page(pages[i]); } kfree(pages); @@ -1112,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr); - mem_cgroup_commit_charge(pages[i], memcg, false); + page_add_new_anon_rmap(pages[i], vma, haddr, false); + mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); @@ -1124,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1141,7 +1225,7 @@ out_free_pages: for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, false); put_page(pages[i]); } kfree(pages); @@ -1171,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - if (page_mapcount(page) == 1) { + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. We can do it by checking page_mapcount() on each sub-page, but + * it's expensive. + * The cheaper way is to check page_count() to be equal 1: every + * mapcount takes page reference reference, so this way we can + * guarantee, that the PMD is the only mapping. + * This can give false negative if somebody pinned the page, but that's + * fine. + */ + if (page_mapcount(page) == 1 && page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1180,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_user_huge_page(page); + get_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1190,30 +1284,33 @@ alloc: } else new_page = NULL; - if (unlikely(!new_page)) { + if (likely(new_page)) { + prep_transhuge_page(new_page); + } else { if (!page) { - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); if (ret & VM_FAULT_OOM) { - split_huge_page(page); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } - put_user_huge_page(page); + put_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, + true))) { put_page(new_page); if (page) { - split_huge_page(page); - put_user_huge_page(page); + split_huge_pmd(vma, pmd, address); + put_page(page); } else - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1233,10 +1330,10 @@ alloc: spin_lock(ptl); if (page) - put_user_huge_page(page); + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; } else { @@ -1244,8 +1341,8 @@ alloc: entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_huge_clear_flush_notify(vma, haddr, pmd); - page_add_new_anon_rmap(new_page, vma, haddr); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, haddr, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); @@ -1254,7 +1351,7 @@ alloc: put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page); + page_remove_rmap(page, true); put_page(page); } ret |= VM_FAULT_WRITE; @@ -1292,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - if (flags & FOLL_TOUCH) { - pmd_t _pmd; + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* - * We should set the dirty bit only for FOLL_WRITE but - * for now the dirty bit in the pmd is meaningless. - * And if the dirty bit will become meaningful and - * we'll only set it with FOLL_WRITE, an atomic - * set_bit will be required on the pmd to set the - * young bit, instead of the current set_pmd_at. + * We don't mlock() pte-mapped THPs. This way we can avoid + * leaking mlocked pages into non-VM_LOCKED VMAs. + * + * In most cases the pmd is the only mapping of the page as we + * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for + * writable private mappings in populate_vma_page_range(). + * + * The only scenario when we have the page shared here is if we + * mlocking read-only mapping shared over fork(). We skip + * mlocking such pages. */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) - update_mmu_cache_pmd(vma, addr, pmd); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - if (page->mapping && trylock_page(page)) { + if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && + page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) mlock_vma_page(page); @@ -1318,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); out: return page; @@ -1453,13 +1550,86 @@ out: return 0; } +int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next) + +{ + spinlock_t *ptl; + pmd_t orig_pmd; + struct page *page; + struct mm_struct *mm = tlb->mm; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + goto out_unlocked; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) { + ret = 1; + goto out; + } + + page = pmd_page(orig_pmd); + /* + * If other processes are mapping this page, we couldn't discard + * the page unless they all do MADV_FREE so let's skip the page. + */ + if (page_mapcount(page) != 1) + goto out; + + if (!trylock_page(page)) + goto out; + + /* + * If user want to discard part-pages of THP, split it so MADV_FREE + * will deactivate only them. + */ + if (next - addr != HPAGE_PMD_SIZE) { + get_page(page); + spin_unlock(ptl); + if (split_huge_page(page)) { + put_page(page); + unlock_page(page); + goto out_unlocked; + } + put_page(page); + unlock_page(page); + ret = 1; + goto out_unlocked; + } + + if (PageDirty(page)) + ClearPageDirty(page); + unlock_page(page); + + if (PageActive(page)) + deactivate_page(page); + + if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + ret = 1; +out: + spin_unlock(ptl); +out_unlocked: + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) return 0; /* * For architectures like ppc64 we look at deposited pgtable @@ -1481,7 +1651,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, put_huge_zero_page(); } else { struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page); + page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -1493,13 +1663,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, +bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; - int ret = 0; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; @@ -1508,7 +1677,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, (new_addr & ~HPAGE_PMD_MASK) || old_end - old_addr < HPAGE_PMD_SIZE || (new_vma->vm_flags & VM_NOHUGEPAGE)) - goto out; + return false; /* * The destination pmd shouldn't be established, free_pgtables() @@ -1516,15 +1685,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, */ if (WARN_ON(!pmd_none(*new_pmd))) { VM_BUG_ON(pmd_trans_huge(*new_pmd)); - goto out; + return false; } /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_sem prevents deadlock. */ - ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); - if (ret == 1) { + old_ptl = __pmd_trans_huge_lock(old_pmd, vma); + if (old_ptl) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -1540,9 +1709,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); + return true; } -out: - return ret; + return false; } /* @@ -1558,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = __pmd_trans_huge_lock(pmd, vma); + if (ptl) { pmd_t entry; bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; @@ -1589,405 +1759,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } /* - * Returns 1 if a given pmd maps a stable (not under splitting) thp. - * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. - * - * Note that if it returns 1, this routine returns without unlocking page - * table locks. So callers must unlock them. - */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) -{ - *ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(*ptl); - wait_split_huge_page(vma->anon_vma, pmd); - return -1; - } else { - /* Thp mapped by 'pmd' is stable, so we can - * handle it as it is. */ - return 1; - } - } - spin_unlock(*ptl); - return 0; -} - -/* - * This function returns whether a given @page is mapped onto the @address - * in the virtual space of @mm. + * Returns true if a given pmd maps a thp, false otherwise. * - * When it's true, this function returns *pmd with holding the page table lock - * and passing it back to the caller via @ptl. - * If it's false, returns NULL without holding the page table lock. + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. */ -pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - enum page_check_address_pmd_flag flag, - spinlock_t **ptl) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (address & ~HPAGE_PMD_MASK) - return NULL; - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - - *ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock; - if (pmd_page(*pmd) != page) - goto unlock; - /* - * split_vma() may create temporary aliased mappings. There is - * no risk as long as all huge pmd are found and have their - * splitting bit set before __split_huge_page_refcount - * runs. Finding the same huge pmd more than once during the - * same rmap walk is not a problem. - */ - if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)) - goto unlock; - if (pmd_trans_huge(*pmd)) { - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && - !pmd_trans_splitting(*pmd)); - return pmd; - } -unlock: - spin_unlock(*ptl); - return NULL; -} - -static int __split_huge_page_splitting(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pmd_t *pmd; - int ret = 0; - /* For mmu_notifiers */ - const unsigned long mmun_start = address; - const unsigned long mmun_end = address + HPAGE_PMD_SIZE; - - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); - if (pmd) { - /* - * We can't temporarily set the pmd to null in order - * to split it, the pmd must remain marked huge at all - * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->rwsem to - * serialize against split_huge_page*. - */ - pmdp_splitting_flush(vma, address, pmd); - - ret = 1; - spin_unlock(ptl); - } - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - return ret; -} - -static void __split_huge_page_refcount(struct page *page, - struct list_head *list) -{ - int i; - struct zone *zone = page_zone(page); - struct lruvec *lruvec; - int tail_count = 0; - - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); - - compound_lock(page); - /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(page); - - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { - struct page *page_tail = page + i; - - /* tail_page->_mapcount cannot change */ - BUG_ON(page_mapcount(page_tail) < 0); - tail_count += page_mapcount(page_tail); - /* check for overflow */ - BUG_ON(tail_count < 0); - BUG_ON(atomic_read(&page_tail->_count) != 0); - /* - * tail_page->_count is zero and not changing from - * under us. But get_page_unless_zero() may be running - * from under us on the tail_page. If we used - * atomic_set() below instead of atomic_add(), we - * would then run atomic_set() concurrently with - * get_page_unless_zero(), and atomic_set() is - * implemented in C not using locked ops. spin_unlock - * on x86 sometime uses locked ops because of PPro - * errata 66, 92, so unless somebody can guarantee - * atomic_set() here would be safe on all archs (and - * not only on x86), it's safer to use atomic_add(). - */ - atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, - &page_tail->_count); - - /* after clearing PageTail the gup refcount can be released */ - smp_mb__after_atomic(); - - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - page_tail->flags |= (page->flags & - ((1L << PG_referenced) | - (1L << PG_swapbacked) | - (1L << PG_mlocked) | - (1L << PG_uptodate) | - (1L << PG_active) | - (1L << PG_unevictable))); - page_tail->flags |= (1L << PG_dirty); - - clear_compound_head(page_tail); - - if (page_is_young(page)) - set_page_young(page_tail); - if (page_is_idle(page)) - set_page_idle(page_tail); - - /* - * __split_huge_page_splitting() already set the - * splitting bit in all pmd that could map this - * hugepage, that will ensure no CPU can alter the - * mapcount on the head page. The mapcount is only - * accounted in the head page and it has to be - * transferred to all tail pages in the below code. So - * for this code to be safe, the split the mapcount - * can't change. But that doesn't mean userland can't - * keep changing and reading the page contents while - * we transfer the mapcount, so the pmd splitting - * status is achieved setting a reserved bit in the - * pmd, not by clearing the present bit. - */ - page_tail->_mapcount = page->_mapcount; - - BUG_ON(page_tail->mapping); - page_tail->mapping = page->mapping; - - page_tail->index = page->index + i; - page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); - - BUG_ON(!PageAnon(page_tail)); - BUG_ON(!PageUptodate(page_tail)); - BUG_ON(!PageDirty(page_tail)); - BUG_ON(!PageSwapBacked(page_tail)); - - lru_add_page_tail(page, page_tail, lruvec, list); - } - atomic_sub(tail_count, &page->_count); - BUG_ON(atomic_read(&page->_count) <= 0); - - __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); - - ClearPageCompound(page); - compound_unlock(page); - spin_unlock_irq(&zone->lru_lock); - - for (i = 1; i < HPAGE_PMD_NR; i++) { - struct page *page_tail = page + i; - BUG_ON(page_count(page_tail) <= 0); - /* - * Tail pages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. - */ - put_page(page_tail); - } - - /* - * Only the head page (now become a regular page) is required - * to be pinned by the caller. - */ - BUG_ON(page_count(page) <= 0); -} - -static int __split_huge_page_map(struct page *page, - struct vm_area_struct *vma, - unsigned long address) +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - pmd_t *pmd, _pmd; - int ret = 0, i; - pgtable_t pgtable; - unsigned long haddr; - - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); - if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); - if (pmd_write(*pmd)) - BUG_ON(page_mapcount(page) != 1); - - haddr = address; - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - BUG_ON(PageCompound(page+i)); - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - entry = mk_pte(page + i, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (!pmd_write(*pmd)) - entry = pte_wrprotect(entry); - if (!pmd_young(*pmd)) - entry = pte_mkold(entry); - pte = pte_offset_map(&_pmd, haddr); - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); - } - - smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and - * userland has the whole access to the hugepage - * during the split (which happens in place). If we - * overwrite the pmd with the not-huge version - * pointing to the pte here (which of course we could - * if all CPUs were bug free), userland could trigger - * a small page size TLB miss on the small sized TLB - * while the hugepage TLB entry is still established - * in the huge TLB. Some CPU doesn't like that. See - * http://support.amd.com/us/Processor_TechDocs/41322.pdf, - * Erratum 383 on page 93. Intel should be safe but is - * also warns that it's only safe if the permission - * and cache attributes of the two entries loaded in - * the two TLB is identical (which should be the case - * here). But it is generally safer to never allow - * small and huge TLB entries for the same virtual - * address to be loaded simultaneously. So instead of - * doing "pmd_populate(); flush_pmd_tlb_range();" we first - * mark the current pmd notpresent (atomically because - * here the pmd_trans_huge and pmd_trans_splitting - * must remain set at all times on the pmd until the - * split is complete for this pmd), then we flush the - * SMP TLB and finally we write the non-huge version - * of the pmd entry with pmd_populate. - */ - pmdp_invalidate(vma, address, pmd); - pmd_populate(mm, pmd, pgtable); - ret = 1; - spin_unlock(ptl); - } - - return ret; -} - -/* must be called with anon_vma->root->rwsem held */ -static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma, - struct list_head *list) -{ - int mapcount, mapcount2; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct anon_vma_chain *avc; - - BUG_ON(!PageHead(page)); - BUG_ON(PageTail(page)); - - mapcount = 0; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long addr = vma_address(page, vma); - BUG_ON(is_vma_temporary_stack(vma)); - mapcount += __split_huge_page_splitting(page, vma, addr); - } - /* - * It is critical that new vmas are added to the tail of the - * anon_vma list. This guarantes that if copy_huge_pmd() runs - * and establishes a child pmd before - * __split_huge_page_splitting() freezes the parent pmd (so if - * we fail to prevent copy_huge_pmd() from running until the - * whole __split_huge_page() is complete), we will still see - * the newly established pmd of the child later during the - * walk, to be able to set it as pmd_trans_splitting too. - */ - if (mapcount != page_mapcount(page)) { - pr_err("mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); - BUG(); - } - - __split_huge_page_refcount(page, list); - - mapcount2 = 0; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long addr = vma_address(page, vma); - BUG_ON(is_vma_temporary_stack(vma)); - mapcount2 += __split_huge_page_map(page, vma, addr); - } - if (mapcount != mapcount2) { - pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); - BUG(); - } -} - -/* - * Split a hugepage into normal pages. This doesn't change the position of head - * page. If @list is null, tail pages will be added to LRU list, otherwise, to - * @list. Both head page and tail pages will inherit mapping, flags, and so on - * from the hugepage. - * Return 0 if the hugepage is split successfully otherwise return 1. - */ -int split_huge_page_to_list(struct page *page, struct list_head *list) -{ - struct anon_vma *anon_vma; - int ret = 1; - - BUG_ON(is_huge_zero_page(page)); - BUG_ON(!PageAnon(page)); - - /* - * The caller does not necessarily hold an mmap_sem that would prevent - * the anon_vma disappearing so we first we take a reference to it - * and then lock the anon_vma for write. This is similar to - * page_lock_anon_vma_read except the write lock is taken to serialise - * against parallel split or collapse operations. - */ - anon_vma = page_get_anon_vma(page); - if (!anon_vma) - goto out; - anon_vma_lock_write(anon_vma); - - ret = 0; - if (!PageCompound(page)) - goto out_unlock; - - BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma, list); - count_vm_event(THP_SPLIT); - - BUG_ON(PageCompound(page)); -out_unlock: - anon_vma_unlock_write(anon_vma); - put_anon_vma(anon_vma); -out: - return ret; + ptl = pmd_lock(vma->vm_mm, pmd); + if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + return ptl; + spin_unlock(ptl); + return NULL; } #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) @@ -2198,26 +1982,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { - struct page *page; + struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0; + int none_or_zero = 0, result = 0; bool referenced = false, writable = false; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) + ++none_or_zero <= khugepaged_max_ptes_none) { continue; - else + } else { + result = SCAN_EXCEED_NONE_PTE; goto out; + } } - if (!pte_present(pteval)) + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; goto out; + } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; goto out; + } VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); @@ -2229,8 +2020,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; goto out; + } /* * cannot use mapcount: can't collapse if there's a gup pin. @@ -2239,6 +2032,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (page_count(page) != 1 + !!PageSwapCache(page)) { unlock_page(page); + result = SCAN_PAGE_COUNT; goto out; } if (pte_write(pteval)) { @@ -2246,6 +2040,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } else { if (PageSwapCache(page) && !reuse_swap_page(page)) { unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; goto out; } /* @@ -2260,6 +2055,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (isolate_lru_page(page)) { unlock_page(page); + result = SCAN_DEL_PAGE_LRU; goto out; } /* 0 stands for page_is_file_cache(page) == false */ @@ -2273,10 +2069,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } - if (likely(referenced && writable)) - return 1; + if (likely(writable)) { + if (likely(referenced)) { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; + } + } else { + result = SCAN_PAGE_RO; + } + out: release_pte_pages(pte, _pte); + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); return 0; } @@ -2321,7 +2128,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page); + page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -2431,6 +2238,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, return NULL; } + prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return *hpage; } @@ -2442,8 +2250,12 @@ static int khugepaged_find_target_node(void) static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); + struct page *page; + + page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + if (page) + prep_transhuge_page(page); + return page; } static struct page *khugepaged_alloc_hugepage(bool *wait) @@ -2493,7 +2305,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) return false; - if (!vma->anon_vma || vma->vm_ops) return false; if (is_vma_temporary_stack(vma)) @@ -2513,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; - int isolated; + int isolated = 0, result = 0; unsigned long hstart, hend; struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2528,12 +2339,15 @@ static void collapse_huge_page(struct mm_struct *mm, /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); - if (!new_page) - return; + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } - if (unlikely(mem_cgroup_try_charge(new_page, mm, - gfp, &memcg))) - return; + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out_nolock; + } /* * Prevent all access to pagetables with the exception of @@ -2541,21 +2355,31 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(khugepaged_test_exit(mm))) { + result = SCAN_ANY_PROCESS; goto out; + } vma = find_vma(mm, address); - if (!vma) + if (!vma) { + result = SCAN_VMA_NULL; goto out; + } hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) + if (address < hstart || address + HPAGE_PMD_SIZE > hend) { + result = SCAN_ADDRESS_RANGE; goto out; - if (!hugepage_vma_check(vma)) + } + if (!hugepage_vma_check(vma)) { + result = SCAN_VMA_CHECK; goto out; + } pmd = mm_find_pmd(mm, address); - if (!pmd) + if (!pmd) { + result = SCAN_PMD_NULL; goto out; + } anon_vma_lock_write(vma->anon_vma); @@ -2592,6 +2416,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; goto out; } @@ -2618,8 +2443,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -2629,12 +2454,17 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = NULL; khugepaged_pages_collapsed++; + result = SCAN_SUCCEED; out_up_write: up_write(&mm->mmap_sem); + trace_mm_collapse_huge_page(mm, isolated, result); return; +out_nolock: + trace_mm_collapse_huge_page(mm, isolated, result); + return; out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -2645,8 +2475,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0; - struct page *page; + int ret = 0, none_or_zero = 0, result = 0; + struct page *page = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE; @@ -2655,8 +2485,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); pmd = mm_find_pmd(mm, address); - if (!pmd) + if (!pmd) { + result = SCAN_PMD_NULL; goto out; + } memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2665,19 +2497,32 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) + ++none_or_zero <= khugepaged_max_ptes_none) { continue; - else + } else { + result = SCAN_EXCEED_NONE_PTE; goto out_unmap; + } } - if (!pte_present(pteval)) + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; goto out_unmap; + } if (pte_write(pteval)) writable = true; page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; + goto out_unmap; + } + + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; goto out_unmap; + } + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. @@ -2685,26 +2530,48 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; goto out_unmap; + } khugepaged_node_load[node]++; - VM_BUG_ON_PAGE(PageCompound(page), page); - if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + if (!PageLRU(page)) { + result = SCAN_SCAN_ABORT; goto out_unmap; + } + if (PageLocked(page)) { + result = SCAN_PAGE_LOCK; + goto out_unmap; + } + if (!PageAnon(page)) { + result = SCAN_PAGE_ANON; + goto out_unmap; + } + /* * cannot use mapcount: can't collapse if there's a gup pin. * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) + if (page_count(page) != 1 + !!PageSwapCache(page)) { + result = SCAN_PAGE_COUNT; goto out_unmap; + } if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } - if (referenced && writable) - ret = 1; + if (writable) { + if (referenced) { + result = SCAN_SUCCEED; + ret = 1; + } else { + result = SCAN_NO_REFERENCED_PAGE; + } + } else { + result = SCAN_PAGE_RO; + } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { @@ -2713,6 +2580,8 @@ out_unmap: collapse_huge_page(mm, address, hpage, vma, node); } out: + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + none_or_zero, result); return ret; } @@ -2938,8 +2807,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ + pmdp_huge_clear_flush_notify(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2958,66 +2827,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, put_huge_zero_page(); } -void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd) +static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long haddr, bool freeze) { - spinlock_t *ptl; - struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct page *page; + pgtable_t pgtable; + pmd_t _pmd; + bool young, write, dirty; + int i; - BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); + VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + + count_vm_event(THP_SPLIT_PMD); - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; -again: - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_trans_huge(*pmd))) - goto unlock; if (vma_is_dax(vma)) { pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); if (is_huge_zero_pmd(_pmd)) put_huge_zero_page(); + return; } else if (is_huge_zero_pmd(*pmd)) { - __split_huge_zero_page_pmd(vma, haddr, pmd); - } else { - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!page_count(page), page); - get_page(page); + return __split_huge_zero_page_pmd(vma, haddr, pmd); } - unlock: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (!page) - return; + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + atomic_add(HPAGE_PMD_NR - 1, &page->_count); + write = pmd_write(*pmd); + young = pmd_young(*pmd); + dirty = pmd_dirty(*pmd); - split_huge_page(page); - put_page(page); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t entry, *pte; + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. + */ + if (freeze) { + swp_entry_t swp_entry; + swp_entry = make_migration_entry(page + i, write); + entry = swp_entry_to_pte(swp_entry); + } else { + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(entry, vma); + if (!write) + entry = pte_wrprotect(entry); + if (!young) + entry = pte_mkold(entry); + } + if (dirty) + SetPageDirty(page + i); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + atomic_inc(&page[i]._mapcount); + pte_unmap(pte); + } + + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_inc(&page[i]._mapcount); + } + + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } + } + + smp_wmb(); /* make pte visible before pmd */ /* - * We don't always have down_write of mmap_sem here: a racing - * do_huge_pmd_wp_page() might have copied-on-write to another - * huge page before our split_huge_page() got the anon_vma lock. + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * and pmd_trans_splitting must remain set at all times on the pmd + * until the split is complete for this pmd), then we flush the SMP TLB + * and finally we write the non-huge version of the pmd entry with + * pmd_populate. */ - if (unlikely(pmd_trans_huge(*pmd))) - goto again; + pmdp_invalidate(vma, haddr, pmd); + pmd_populate(mm, pmd, pgtable); + + if (freeze) { + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + page_remove_rmap(page + i, false); + put_page(page + i); + } + } } -void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd) +void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address) { - struct vm_area_struct *vma; + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + struct page *page = NULL; + unsigned long haddr = address & HPAGE_PMD_MASK; - vma = find_vma(mm, address); - BUG_ON(vma == NULL); - split_huge_page_pmd(vma, address, pmd); + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); + ptl = pmd_lock(mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (PageMlocked(page)) + get_page(page); + else + page = NULL; + } else if (!pmd_devmap(*pmd)) + goto out; + __split_huge_pmd_locked(vma, pmd, haddr, false); +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + if (page) { + lock_page(page); + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } } -static void split_huge_page_address(struct mm_struct *mm, +static void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; @@ -3026,7 +2982,7 @@ static void split_huge_page_address(struct mm_struct *mm, VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); + pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; @@ -3035,13 +2991,13 @@ static void split_huge_page_address(struct mm_struct *mm, return; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd_mm(mm, address, pmd); + split_huge_pmd(vma, pmd, address); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3057,7 +3013,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, start); + split_huge_pmd_address(vma, start); /* * If the new end address isn't hpage aligned and it could @@ -3067,7 +3023,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, end); + split_huge_pmd_address(vma, end); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3081,6 +3037,550 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_page_address(next->vm_mm, nstart); + split_huge_pmd_address(next, nstart); + } +} + +static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, nr = HPAGE_PMD_NR; + + /* Skip pages which doesn't belong to the VMA */ + if (address < vma->vm_start) { + int off = (vma->vm_start - address) >> PAGE_SHIFT; + page += off; + nr -= off; + address = vma->vm_start; + } + + pgd = pgd_offset(vma->vm_mm, address); + if (!pgd_present(*pgd)) + return; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + pmd = pmd_offset(pud, address); + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return; + } + if (pmd_trans_huge(*pmd)) { + if (page == pmd_page(*pmd)) + __split_huge_pmd_locked(vma, pmd, haddr, true); + spin_unlock(ptl); + return; + } + spin_unlock(ptl); + + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); + for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { + pte_t entry, swp_pte; + swp_entry_t swp_entry; + + /* + * We've just crossed page table boundary: need to map next one. + * It can happen if THP was mremaped to non PMD-aligned address. + */ + if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { + pte_unmap_unlock(pte - 1, ptl); + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + pte = pte_offset_map_lock(vma->vm_mm, pmd, + address, &ptl); + } + + if (!pte_present(*pte)) + continue; + if (page_to_pfn(page) != pte_pfn(*pte)) + continue; + flush_cache_page(vma, address, page_to_pfn(page)); + entry = ptep_clear_flush(vma, address, pte); + if (pte_dirty(entry)) + SetPageDirty(page); + swp_entry = make_migration_entry(page, pte_write(entry)); + swp_pte = swp_entry_to_pte(swp_entry); + if (pte_soft_dirty(entry)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(vma->vm_mm, address, pte, swp_pte); + page_remove_rmap(page, false); + put_page(page); + } + pte_unmap_unlock(pte - 1, ptl); +} + +static void freeze_page(struct anon_vma *anon_vma, struct page *page) +{ + struct anon_vma_chain *avc; + pgoff_t pgoff = page_to_pgoff(page); + + VM_BUG_ON_PAGE(!PageHead(page), page); + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, + pgoff + HPAGE_PMD_NR - 1) { + unsigned long address = __vma_address(page, avc->vma); + + mmu_notifier_invalidate_range_start(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + freeze_page_vma(avc->vma, page, address); + mmu_notifier_invalidate_range_end(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + } +} + +static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + spinlock_t *ptl; + pmd_t *pmd; + pte_t *pte, entry; + swp_entry_t swp_entry; + unsigned long haddr = address & HPAGE_PMD_MASK; + int i, nr = HPAGE_PMD_NR; + + /* Skip pages which doesn't belong to the VMA */ + if (address < vma->vm_start) { + int off = (vma->vm_start - address) >> PAGE_SHIFT; + page += off; + nr -= off; + address = vma->vm_start; + } + + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); + for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { + /* + * We've just crossed page table boundary: need to map next one. + * It can happen if THP was mremaped to non-PMD aligned address. + */ + if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { + pte_unmap_unlock(pte - 1, ptl); + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + pte = pte_offset_map_lock(vma->vm_mm, pmd, + address, &ptl); + } + + if (!is_swap_pte(*pte)) + continue; + + swp_entry = pte_to_swp_entry(*pte); + if (!is_migration_entry(swp_entry)) + continue; + if (migration_entry_to_page(swp_entry) != page) + continue; + + get_page(page); + page_add_anon_rmap(page, vma, address, false); + + entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); + if (PageDirty(page)) + entry = pte_mkdirty(entry); + if (is_write_migration_entry(swp_entry)) + entry = maybe_mkwrite(entry, vma); + + flush_dcache_page(page); + set_pte_at(vma->vm_mm, address, pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + } + pte_unmap_unlock(pte - 1, ptl); +} + +static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +{ + struct anon_vma_chain *avc; + pgoff_t pgoff = page_to_pgoff(page); + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff, pgoff + HPAGE_PMD_NR - 1) { + unsigned long address = __vma_address(page, avc->vma); + + mmu_notifier_invalidate_range_start(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + unfreeze_page_vma(avc->vma, page, address); + mmu_notifier_invalidate_range_end(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + } +} + +static int __split_huge_page_tail(struct page *head, int tail, + struct lruvec *lruvec, struct list_head *list) +{ + int mapcount; + struct page *page_tail = head + tail; + + mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + + /* + * tail_page->_count is zero and not changing from under us. But + * get_page_unless_zero() may be running from under us on the + * tail_page. If we used atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is implemented in C not + * using locked ops. spin_unlock on x86 sometime uses locked ops + * because of PPro errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and not only on x86), + * it's safer to use atomic_add(). + */ + atomic_add(mapcount + 1, &page_tail->_count); + + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_locked) | + (1L << PG_unevictable) | + (1L << PG_dirty))); + + /* + * After clearing PageTail the gup refcount can be released. + * Page flags also must be visible before we make the page non-compound. + */ + smp_wmb(); + + clear_compound_head(page_tail); + + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) + set_page_idle(page_tail); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + + page_tail->index = head->index + tail; + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + lru_add_page_tail(head, page_tail, lruvec, list); + + return mapcount; +} + +static void __split_huge_page(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct zone *zone = page_zone(head); + struct lruvec *lruvec; + int i, tail_mapcount; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(head, zone); + + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(head); + + tail_mapcount = 0; + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) + tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); + atomic_sub(tail_mapcount, &head->_count); + + ClearPageCompound(head); + spin_unlock_irq(&zone->lru_lock); + + unfreeze_page(page_anon_vma(head), head); + + for (i = 0; i < HPAGE_PMD_NR; i++) { + struct page *subpage = head + i; + if (subpage == page) + continue; + unlock_page(subpage); + + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(subpage); } } + +int total_mapcount(struct page *page) +{ + int i, ret; + + VM_BUG_ON_PAGE(PageTail(page), page); + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + + ret = compound_mapcount(page); + if (PageHuge(page)) + return ret; + for (i = 0; i < HPAGE_PMD_NR; i++) + ret += atomic_read(&page[i]._mapcount) + 1; + if (PageDoubleMap(page)) + ret -= HPAGE_PMD_NR; + return ret; +} + +/* + * This function splits huge page into normal pages. @page can point to any + * subpage of huge page to split. Split doesn't change the position of @page. + * + * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. + * The huge page must be locked. + * + * If @list is null, tail pages will be added to LRU list, otherwise, to @list. + * + * Both head page and tail pages will inherit mapping, flags, and so on from + * the hugepage. + * + * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if + * they are not mapped. + * + * Returns 0 if the hugepage is split successfully. + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under + * us. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct anon_vma *anon_vma; + int count, mapcount, ret; + bool mlocked; + unsigned long flags; + + VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_PAGE(!PageCompound(page), page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. + */ + anon_vma = page_get_anon_vma(head); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + anon_vma_lock_write(anon_vma); + + /* + * Racy check if we can split the page, before freeze_page() will + * split PMDs + */ + if (total_mapcount(head) != page_count(head) - 1) { + ret = -EBUSY; + goto out_unlock; + } + + mlocked = PageMlocked(page); + freeze_page(anon_vma, head); + VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ + if (mlocked) + lru_add_drain(); + + /* Prevent deferred_split_scan() touching ->_count */ + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + count = page_count(head); + mapcount = total_mapcount(head); + if (!mapcount && count == 1) { + if (!list_empty(page_deferred_list(head))) { + pgdata->split_queue_len--; + list_del(page_deferred_list(head)); + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + __split_huge_page(page, list); + ret = 0; + } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + pr_alert("total_mapcount: %u, page_count(): %u\n", + mapcount, count); + if (PageTail(page)) + dump_page(head, NULL); + dump_page(page, "total_mapcount(head) > 0"); + BUG(); + } else { + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + unfreeze_page(anon_vma, head); + ret = -EBUSY; + } + +out_unlock: + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); +out: + count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + return ret; +} + +void free_transhuge_page(struct page *page) +{ + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + unsigned long flags; + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + if (!list_empty(page_deferred_list(page))) { + pgdata->split_queue_len--; + list_del(page_deferred_list(page)); + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + free_compound_page(page); +} + +void deferred_split_huge_page(struct page *page) +{ + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + unsigned long flags; + + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + if (list_empty(page_deferred_list(page))) { + list_add_tail(page_deferred_list(page), &pgdata->split_queue); + pgdata->split_queue_len++; + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); +} + +static unsigned long deferred_split_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct pglist_data *pgdata = NODE_DATA(sc->nid); + return ACCESS_ONCE(pgdata->split_queue_len); +} + +static unsigned long deferred_split_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct pglist_data *pgdata = NODE_DATA(sc->nid); + unsigned long flags; + LIST_HEAD(list), *pos, *next; + struct page *page; + int split = 0; + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + /* Take pin on all head pages to avoid freeing them under us */ + list_for_each_safe(pos, next, &pgdata->split_queue) { + page = list_entry((void *)pos, struct page, mapping); + page = compound_head(page); + if (get_page_unless_zero(page)) { + list_move(page_deferred_list(page), &list); + } else { + /* We lost race with put_compound_page() */ + list_del_init(page_deferred_list(page)); + pgdata->split_queue_len--; + } + if (!--sc->nr_to_scan) + break; + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + + list_for_each_safe(pos, next, &list) { + page = list_entry((void *)pos, struct page, mapping); + lock_page(page); + /* split_huge_page() removes page from list on success */ + if (!split_huge_page(page)) + split++; + unlock_page(page); + put_page(page); + } + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_tail(&list, &pgdata->split_queue); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. + */ + if (!split && list_empty(&pgdata->split_queue)) + return SHRINK_STOP; + return split; +} + +static struct shrinker deferred_split_shrinker = { + .count_objects = deferred_split_count, + .scan_objects = deferred_split_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +#ifdef CONFIG_DEBUG_FS +static int split_huge_pages_set(void *data, u64 val) +{ + struct zone *zone; + struct page *page; + unsigned long pfn, max_zone_pfn; + unsigned long total = 0, split = 0; + + if (val != 1) + return -EINVAL; + + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (!get_page_unless_zero(page)) + continue; + + if (zone != page_zone(page)) + goto next; + + if (!PageHead(page) || !PageAnon(page) || + PageHuge(page)) + goto next; + + total++; + lock_page(page); + if (!split_huge_page(page)) + split++; + unlock_page(page); +next: + put_page(page); + } + } + + pr_info("%lu of %lu THP split", split, total); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, + "%llu\n"); + +static int __init split_huge_pages_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, + &split_huge_pages_fops); + if (!ret) + pr_warn("Failed to create split_huge_pages in debugfs"); + return 0; +} +late_initcall(split_huge_pages_debugfs); +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef6963b..06ae13e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4,7 +4,6 @@ */ #include <linux/list.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/sysctl.h> @@ -1002,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1215,8 +1214,8 @@ void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; - BUG_ON(page_count(page)); - BUG_ON(page_mapcount(page)); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(page_mapcount(page), page); restore_reserve = PagePrivate(page); ClearPagePrivate(page); @@ -1268,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - __SetPageHead(page); __ClearPageReserved(page); + __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { /* * For gigantic hugepages allocated through bootmem at @@ -1287,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_page_count(p, 0); set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } /* @@ -2549,25 +2549,6 @@ static void hugetlb_unregister_node(struct node *node) nhs->hugepages_kobj = NULL; } -/* - * hugetlb module exit: unregister hstate attributes from node devices - * that have them. - */ -static void hugetlb_unregister_all_nodes(void) -{ - int nid; - - /* - * disable node device registrations. - */ - register_hugetlbfs_with_node(NULL, NULL); - - /* - * remove hstate attributes from any nodes that have them. - */ - for (nid = 0; nid < nr_node_ids; nid++) - hugetlb_unregister_node(node_devices[nid]); -} /* * Register hstate attributes for a single node device. @@ -2632,27 +2613,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) return NULL; } -static void hugetlb_unregister_all_nodes(void) { } - static void hugetlb_register_all_nodes(void) { } #endif -static void __exit hugetlb_exit(void) -{ - struct hstate *h; - - hugetlb_unregister_all_nodes(); - - for_each_hstate(h) { - kobject_put(hstate_kobjs[hstate_index(h)]); - } - - kobject_put(hugepages_kobj); - kfree(hugetlb_fault_mutex_table); -} -module_exit(hugetlb_exit); - static int __init hugetlb_init(void) { int i; @@ -2690,7 +2654,7 @@ static int __init hugetlb_init(void) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } -module_init(hugetlb_init); +subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ void __init hugetlb_add_hstate(unsigned int order) @@ -3139,7 +3103,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); - page_dup_rmap(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(pages_per_huge_page(h), dst); } @@ -3223,7 +3187,7 @@ again: set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page); + page_remove_rmap(page, true); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { address += sz; @@ -3452,7 +3416,7 @@ retry_avoidcopy: mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page); + page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; @@ -3622,7 +3586,7 @@ retry: ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); } else - page_dup_rmap(page); + page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); @@ -3902,7 +3866,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page_foll(pages[i]); + get_page(pages[i]); } if (vmas) diff --git a/mm/internal.h b/mm/internal.h index 38e24b8..a38a21e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,7 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/pagemap.h> /* * The set of flags that only affect watermark checking and reclaim @@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __get_page_tail_foll(struct page *page, - bool get_page_head) -{ - /* - * If we're getting a tail page, the elevated page->_count is - * required only in the head page and we will elevate the head - * page->_count and tail page->_mapcount. - * - * We elevate page_tail->_mapcount for tail pages to force - * page_tail->_count to be zero at all times to avoid getting - * false positives from get_page_unless_zero() with - * speculative page access (like in - * page_cache_get_speculative()) on tail pages. - */ - VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); - if (get_page_head) - atomic_inc(&compound_head(page)->_count); - get_huge_page_tail(page); -} - -/* - * This is meant to be called as the FOLL_GET operation of - * follow_page() and it must be called while holding the proper PT - * lock while the pte (or pmd_trans_huge) is still mapping the page. - */ -static inline void get_page_foll(struct page *page) -{ - if (unlikely(PageTail(page))) - /* - * This is safe only because - * __split_huge_page_refcount() can't run under - * get_page_foll() because we hold the proper PT lock. - */ - __get_page_tail_foll(page, true); - else { - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); - } -} - extern unsigned long highest_memmap_pfn; /* @@ -259,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; +} + +/* + * Stack area - atomatically grows in one direction + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return (flags & VM_STACK) == VM_STACK; +} + +/* + * Data area - private, writable, not stack + */ +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); @@ -309,10 +297,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern unsigned long vma_address(struct page *page, - struct vm_area_struct *vma); -#endif +/* + * At what user virtual address is page expected in @vma? + */ +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page_to_pgoff(page); + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + + return address; +} + #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 6471014..a61460d 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,4 +1,5 @@ KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 19423a4..25c0ad3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -122,8 +122,7 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ - __GFP_NOACCOUNT)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) @@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item) up_read(&mm->mmap_sem); } -static struct page *page_trans_compound_anon(struct page *page) -{ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - /* - * head may actually be splitted and freed from under - * us but it's ok here. - */ - if (PageAnon(head)) - return head; - } - return NULL; -} - static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page) || page_trans_compound_anon(page)) { + if (PageAnon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -740,8 +726,7 @@ static int remove_stable_node(struct stable_node *stable_node) static int remove_all_stable_nodes(void) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; int nid; int err = 0; @@ -756,8 +741,7 @@ static int remove_all_stable_nodes(void) cond_resched(); } } - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, struct stable_node, list); + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); @@ -958,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } get_page(kpage); - page_add_anon_rmap(kpage, vma, addr); + page_add_anon_rmap(kpage, vma, addr, false); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -977,33 +961,6 @@ out: return err; } -static int page_trans_compound_anon_split(struct page *page) -{ - int ret = 0; - struct page *transhuge_head = page_trans_compound_anon(page); - if (transhuge_head) { - /* Get the reference on the head to split it. */ - if (get_page_unless_zero(transhuge_head)) { - /* - * Recheck we got the reference while the head - * was still anonymous. - */ - if (PageAnon(transhuge_head)) - ret = split_huge_page(transhuge_head); - else - /* - * Retry later if split_huge_page run - * from under us. - */ - ret = 1; - put_page(transhuge_head); - } else - /* Retry later if split_huge_page run from under us. */ - ret = 1; - } - return ret; -} - /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -1022,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (PageTransCompound(page) && page_trans_compound_anon_split(page)) - goto out; - BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1037,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, */ if (!trylock_page(page)) goto out; + + if (PageTransCompound(page)) { + err = split_huge_page(page); + if (err) + goto out_unlock; + } + /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its @@ -1052,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, */ set_page_stable_node(page, NULL); mark_page_accessed(page); + /* + * Page reclaim just frees a clean page with no dirty + * ptes: make sure that the ksm page would be swapped. + */ + if (!PageDirty(page)) + SetPageDirty(page); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); @@ -1067,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, } } +out_unlock: unlock_page(page); out: return err; @@ -1583,13 +1551,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; struct page *page; - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, - struct stable_node, list); + list_for_each_entry_safe(stable_node, next, + &migrate_nodes, list) { page = get_ksm_page(stable_node, false); if (page) put_page(page); @@ -1639,8 +1605,7 @@ next_mm: cond_resched(); continue; } - if (PageAnon(*page) || - page_trans_compound_anon(*page)) { + if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1903,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); - __set_page_locked(new_page); + __SetPageLocked(new_page); } return new_page; @@ -2012,8 +1977,7 @@ static void wait_while_offlining(void) static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; struct rb_node *node; int nid; @@ -2034,8 +1998,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn, cond_resched(); } } - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, struct stable_node, list); + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); diff --git a/mm/list_lru.c b/mm/list_lru.c index afc71ea..1d05cb9 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/memcontrol.h> -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru) static void list_lru_unregister(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/madvise.c b/mm/madvise.c index c889fcb..f56825b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -20,6 +20,9 @@ #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlb.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte, *pte, ptent; + struct page *page; + int nr_swap = 0; + unsigned long next; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) + if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) + goto next; + + if (pmd_trans_unstable(pmd)) + return 0; + + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + /* + * If the pte has swp_entry, just clear page table to + * prevent swap-in which is more expensive rather than + * (page allocation + zeroing). + */ + if (!pte_present(ptent)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry)) + continue; + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + continue; + } + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * If pmd isn't transhuge but the page is THP and + * is owned by only this process, split it and + * deactivate all pages. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + goto out; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + goto out; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + goto out; + } + put_page(page); + unlock_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (PageSwapCache(page) || PageDirty(page)) { + if (!trylock_page(page)) + continue; + /* + * If page is shared with others, we couldn't clear + * PG_dirty of the page. + */ + if (page_mapcount(page) != 1) { + unlock_page(page); + continue; + } + + if (PageSwapCache(page) && !try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + if (pte_young(ptent) || pte_dirty(ptent)) { + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. + */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + if (PageActive(page)) + deactivate_page(page); + tlb_remove_tlb_entry(tlb, pte, addr); + } + } +out: + if (nr_swap) { + if (current->mm == mm) + sync_mm_rss(mm); + + add_mm_counter(mm, MM_SWAPENTS, nr_swap); + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); +next: + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment */ + if (!vma_is_anonymous(vma)) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. + */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* passthrough */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/memblock.c b/mm/memblock.c index d300f13..dd79899 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, { unsigned long i; - for (i = 0; i < type->cnt; i++) { - phys_addr_t rgnbase = type->regions[i].base; - phys_addr_t rgnsize = type->regions[i].size; - if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + for (i = 0; i < type->cnt; i++) + if (memblock_addrs_overlap(base, size, type->regions[i].base, + type->regions[i].size)) break; - } - return i < type->cnt; } @@ -528,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type, bool insert = false; phys_addr_t obase = base; phys_addr_t end = base + memblock_cap_size(base, &size); - int i, nr_new; + int idx, nr_new; + struct memblock_region *rgn; if (!size) return 0; @@ -552,8 +550,7 @@ repeat: base = obase; nr_new = 0; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -572,7 +569,7 @@ repeat: WARN_ON(flags != rgn->flags); nr_new++; if (insert) - memblock_insert_region(type, i++, base, + memblock_insert_region(type, idx++, base, rbase - base, nid, flags); } @@ -584,7 +581,7 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, + memblock_insert_region(type, idx, base, end - base, nid, flags); } @@ -651,7 +648,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, int *start_rgn, int *end_rgn) { phys_addr_t end = base + memblock_cap_size(base, &size); - int i; + int idx; + struct memblock_region *rgn; *start_rgn = *end_rgn = 0; @@ -663,8 +661,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -681,7 +678,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->base = base; rgn->size -= base - rbase; type->total_size -= base - rbase; - memblock_insert_region(type, i, rbase, base - rbase, + memblock_insert_region(type, idx, rbase, base - rbase, memblock_get_region_node(rgn), rgn->flags); } else if (rend > end) { @@ -692,14 +689,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->base = end; rgn->size -= end - rbase; type->total_size -= end - rbase; - memblock_insert_region(type, i--, rbase, end - rbase, + memblock_insert_region(type, idx--, rbase, end - rbase, memblock_get_region_node(rgn), rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) - *start_rgn = i; - *end_rgn = i + 1; + *start_rgn = idx; + *end_rgn = idx + 1; } } @@ -822,6 +819,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); } +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} /** * __next_reserved_mem_region - next function for for_each_reserved_region() @@ -913,6 +921,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1022,6 +1034,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1432,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) * Remaining API functions */ -phys_addr_t __init memblock_phys_mem_size(void) +phys_addr_t __init_memblock memblock_phys_mem_size(void) { return memblock.memory.total_size; } @@ -1509,16 +1525,25 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr return -1; } -int __init memblock_is_reserved(phys_addr_t addr) +bool __init memblock_is_reserved(phys_addr_t addr) { return memblock_search(&memblock.reserved, addr) != -1; } -int __init_memblock memblock_is_memory(phys_addr_t addr) +bool __init_memblock memblock_is_memory(phys_addr_t addr) { return memblock_search(&memblock.memory, addr) != -1; } +int __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) + return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) @@ -1613,12 +1638,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name { unsigned long long base, size; unsigned long flags; - int i; + int idx; + struct memblock_region *rgn; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1630,7 +1655,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", - name, i, base, base + size - 1, size, nid_buf, flags); + name, idx, base, base + size - 1, size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e234c21..d06cae2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,7 +66,6 @@ #include "internal.h" #include <net/sock.h> #include <net/ip.h> -#include <net/tcp_memcontrol.h> #include "slab.h" #include <asm/uaccess.h> @@ -76,9 +75,15 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); +struct mem_cgroup *root_mem_cgroup __read_mostly; + #define MEM_CGROUP_RECLAIM_RETRIES 5 -static struct mem_cgroup *root_mem_cgroup __read_mostly; -struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; + +/* Socket memory accounting disabled? */ +static bool cgroup_memory_nosocket; + +/* Kernel memory accounting disabled? */ +static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -87,6 +92,12 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +/* Whether legacy memory+swap accounting is active */ +static bool do_memsw_account(void) +{ + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; +} + static const char * const mem_cgroup_stat_names[] = { "cache", "rss", @@ -230,6 +241,7 @@ enum res_type { _MEMSWAP, _OOM_TYPE, _KMEM, + _TCP, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -238,13 +250,6 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) -/* - * The memcg_create_mutex will be held whenever a new cgroup is created. - * As a consequence, any change that needs to protect against new child cgroups - * appearing has to hold it as well. - */ -static DEFINE_MUTEX(memcg_create_mutex); - /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -288,65 +293,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_css(css); } -/* Writing them here to avoid exposing memcg's inner layout */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) - -void sock_update_memcg(struct sock *sk) -{ - if (mem_cgroup_sockets_enabled) { - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - BUG_ON(!sk->sk_prot->proto_cgroup); - - /* Socket cloning can throw us here with sk_cgrp already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); - return; - } - - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(sock_update_memcg); - -void sock_release_memcg(struct sock *sk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct mem_cgroup *memcg; - WARN_ON(!sk->sk_cgrp->memcg); - memcg = sk->sk_cgrp->memcg; - css_put(&sk->sk_cgrp->memcg->css); - } -} - -struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg || mem_cgroup_is_root(memcg)) - return NULL; - - return &memcg->tcp_mem; -} -EXPORT_SYMBOL(tcp_proto_cgroup); - -#endif - -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: @@ -395,10 +342,10 @@ void memcg_put_cache_ids(void) * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -struct static_key memcg_kmem_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) @@ -419,26 +366,16 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. - * - * XXX: The above description of behavior on the default hierarchy isn't - * strictly true yet as replace_page_cache_page() can modify the - * association before @page is released even on the default hierarchy; - * however, the current and planned usages don't mix the the two functions - * and replace_page_cache_page() will soon be updated to make the invariant - * actually true. */ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = page->mem_cgroup; if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; - rcu_read_unlock(); return &memcg->css; } @@ -696,7 +633,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - int nr_pages) + bool compound, int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is @@ -709,9 +646,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_pages); + } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) @@ -903,14 +842,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && reclaim->generation != iter->generation) goto out_unlock; - do { + while (1) { pos = READ_ONCE(iter->position); + if (!pos || css_tryget(&pos->css)) + break; /* - * A racing update may change the position and - * put the last reference, hence css_tryget(), - * or retry to see the updated position. + * css reference reached zero, so iter->position will + * be cleared by ->css_released. However, we should not + * rely on this happening soon, because ->css_released + * is called from a work queue, and by busy-waiting we + * might block it. So we clear iter->position right + * away. */ - } while (pos && !css_tryget(&pos->css)); + (void)cmpxchg(&iter->position, pos, NULL); + } } if (pos) @@ -940,33 +885,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css == &root->css) break; - if (css_tryget(css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. - */ - if (smp_load_acquire(&memcg->initialized)) - break; - - css_put(css); - } + if (css_tryget(css)) + break; memcg = NULL; } if (reclaim) { - if (cmpxchg(&iter->position, pos, memcg) == pos) { - if (memcg) - css_get(&memcg->css); - if (pos) - css_put(&pos->css); - } - /* - * pairs with css_tryget when dereferencing iter->position - * above. + * The position could have already been updated by a competing + * thread, so check that the value hasn't changed since we read + * it to avoid reclaiming from the same cgroup twice. */ + (void)cmpxchg(&iter->position, pos, memcg); + if (pos) css_put(&pos->css); @@ -999,6 +931,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup_reclaim_iter *iter; + struct mem_cgroup_per_zone *mz; + int nid, zid; + int i; + + while ((memcg = parent_mem_cgroup(memcg))) { + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); + } + } + } + } +} + /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -1138,9 +1092,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -#define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -1159,7 +1110,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (count < limit) margin = limit - count; - if (do_swap_account) { + if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) @@ -1301,9 +1252,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { unsigned long memsw_limit; + unsigned long swap_limit; memsw_limit = memcg->memsw.limit; - limit = min(limit + total_swap_pages, memsw_limit); + swap_limit = memcg->swap.limit; + swap_limit = min(swap_limit, (unsigned long)total_swap_pages); + limit = min(limit + swap_limit, memsw_limit); } return limit; } @@ -1885,7 +1839,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) if (stock->nr_pages) { page_counter_uncharge(&old->memory, stock->nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&old->memsw, stock->nr_pages); css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; @@ -1973,6 +1927,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +static void reclaim_high(struct mem_cgroup *memcg, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); +} + +static void high_work_func(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); +} + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. @@ -1980,20 +1954,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, void mem_cgroup_handle_over_high(void) { unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg, *pos; + struct mem_cgroup *memcg; if (likely(!nr_pages)) return; - pos = memcg = get_mem_cgroup_from_mm(current->mm); - - do { - if (page_counter_read(&pos->memory) <= pos->high) - continue; - mem_cgroup_events(pos, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); - } while ((pos = parent_mem_cgroup(pos))); - + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; } @@ -2015,11 +1982,11 @@ retry: if (consume_stock(memcg, nr_pages)) return 0; - if (!do_swap_account || + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, batch); mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { @@ -2106,7 +2073,7 @@ force: * temporarily by force charging it. */ page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); @@ -2128,6 +2095,11 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + schedule_work(&memcg->high_work); + break; + } current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -2143,7 +2115,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) return; page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); css_put_many(&memcg->css, nr_pages); @@ -2214,7 +2186,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_alloc_cache_id(void) { int id, size; @@ -2332,7 +2304,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2340,6 +2312,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) VM_BUG_ON(!is_root_cache(cachep)); + if (cachep->flags & SLAB_ACCOUNT) + gfp |= __GFP_ACCOUNT; + + if (!(gfp & __GFP_ACCOUNT)) + return cachep; + if (current->memcg_kmem_skip_account) return cachep; @@ -2383,16 +2361,17 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) return 0; - if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) - return -ENOMEM; - ret = try_charge(memcg, gfp, nr_pages); - if (ret) { - page_counter_uncharge(&memcg->kmem, nr_pages); + if (ret) return ret; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + cancel_charge(memcg, nr_pages); + return -ENOMEM; } page->mem_cgroup = memcg; @@ -2421,23 +2400,23 @@ void __memcg_kmem_uncharge(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - page_counter_uncharge(&memcg->kmem, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compound_lock. - * charge/uncharge will be never happen and move_account() is done under - * compound_lock(), so we don't have to take care of races. + * zone->lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -2691,14 +2670,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) { bool ret; - /* - * The lock does not prevent addition or deletion of children, but - * it prevents a new child from being initialized based on this - * parent in css_online(), so it's enough to decide whether - * hierarchically inherited attributes can still be changed or not. - */ - lockdep_assert_held(&memcg_create_mutex); - rcu_read_lock(); ret = css_next_child(NULL, &memcg->css); rcu_read_unlock(); @@ -2761,10 +2732,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - mutex_lock(&memcg_create_mutex); - if (memcg->use_hierarchy == val) - goto out; + return 0; /* * If parent's use_hierarchy is set, we can't make any modifications @@ -2783,9 +2752,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, } else retval = -EINVAL; -out: - mutex_unlock(&memcg_create_mutex); - return retval; } @@ -2801,6 +2767,18 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } +static unsigned long tree_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) +{ + struct mem_cgroup *iter; + unsigned long val = 0; + + for_each_mem_cgroup_tree(iter, memcg) + val += mem_cgroup_read_events(iter, idx); + + return val; +} + static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -2843,6 +2821,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; + case _TCP: + counter = &memcg->tcpmem; + break; default: BUG(); } @@ -2867,103 +2848,180 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +#ifndef CONFIG_SLOB +static int memcg_online_kmem(struct mem_cgroup *memcg) { - int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_acct_activated); - BUG_ON(memcg->kmem_acct_active); - - /* - * For simplicity, we won't allow this to be disabled. It also can't - * be changed if the cgroup has children already, or if tasks had - * already joined. - * - * If tasks join before we set the limit, a person looking at - * kmem.usage_in_bytes will have no way to determine when it took - * place, which makes the value quite meaningless. - * - * After it first became limited, changes in the value of the limit are - * of course permitted. - */ - mutex_lock(&memcg_create_mutex); - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - err = -EBUSY; - mutex_unlock(&memcg_create_mutex); - if (err) - goto out; + BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); - if (memcg_id < 0) { - err = memcg_id; - goto out; - } - - /* - * We couldn't have accounted to this cgroup, because it hasn't got - * activated yet, so this should succeed. - */ - err = page_counter_limit(&memcg->kmem, nr_pages); - VM_BUG_ON(err); + if (memcg_id < 0) + return memcg_id; - static_key_slow_inc(&memcg_kmem_enabled_key); + static_branch_inc(&memcg_kmem_enabled_key); /* - * A memory cgroup is considered kmem-active as soon as it gets + * A memory cgroup is considered kmem-online as soon as it gets * kmemcg_id. Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. */ memcg->kmemcg_id = memcg_id; - memcg->kmem_acct_activated = true; - memcg->kmem_acct_active = true; -out: - return err; + memcg->kmem_state = KMEM_ONLINE; + + return 0; } -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_propagate_kmem(struct mem_cgroup *parent, + struct mem_cgroup *memcg) { - int ret; + int ret = 0; mutex_lock(&memcg_limit_mutex); - if (!memcg_kmem_is_active(memcg)) - ret = memcg_activate_kmem(memcg, limit); - else - ret = page_counter_limit(&memcg->kmem, limit); + /* + * If the parent cgroup is not kmem-online now, it cannot be + * onlined after this point, because it has at least one child + * already. + */ + if (memcg_kmem_online(parent) || + (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) + ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { - int ret = 0; - struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + + if (memcg->kmem_state != KMEM_ONLINE) + return; + /* + * Clear the online state before clearing memcg_caches array + * entries. The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). + */ + memcg->kmem_state = KMEM_ALLOCATED; + + memcg_deactivate_kmem_caches(memcg); + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); if (!parent) - return 0; + parent = root_mem_cgroup; - mutex_lock(&memcg_limit_mutex); /* - * If the parent cgroup is not kmem-active now, it cannot be activated - * after this point, because it has at least one child already. + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). */ - if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); - mutex_unlock(&memcg_limit_mutex); - return ret; + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ + /* css_alloc() failed, offlining didn't happen */ + if (unlikely(memcg->kmem_state == KMEM_ONLINE)) + memcg_offline_kmem(memcg); + + if (memcg->kmem_state == KMEM_ALLOCATED) { + memcg_destroy_kmem_caches(memcg); + static_branch_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } } #else +static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) +{ + return 0; +} +static int memcg_online_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} +#endif /* !CONFIG_SLOB */ + static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - return -EINVAL; + int ret = 0; + + mutex_lock(&memcg_limit_mutex); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_online(memcg)) { + if (cgroup_is_populated(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) + ret = -EBUSY; + if (ret) + goto out; + ret = memcg_online_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: + mutex_unlock(&memcg_limit_mutex); + return ret; +} + +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + int ret; + + mutex_lock(&memcg_limit_mutex); + + ret = page_counter_limit(&memcg->tcpmem, limit); + if (ret) + goto out; + + if (!memcg->tcpmem_active) { + /* + * The active flag needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See sock_update_memcg() for + * details, and note that we don't mark any socket as belonging + * to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in sock_update_memcg(), + * because when this value change, the code to process it is not + * patched in yet. + */ + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcpmem_active = true; + } +out: + mutex_unlock(&memcg_limit_mutex); + return ret; } -#endif /* CONFIG_MEMCG_KMEM */ /* * The user of this function is... @@ -2997,6 +3055,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); break; + case _TCP: + ret = memcg_update_tcp_limit(memcg, nr_pages); + break; } break; case RES_SOFT_LIMIT: @@ -3023,6 +3084,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; + case _TCP: + counter = &memcg->tcpmem; + break; default: BUG(); } @@ -3138,7 +3202,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); @@ -3160,14 +3224,14 @@ static int memcg_stat_show(struct seq_file *m, void *v) } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); - if (do_swap_account) + if (do_memsw_account()) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; @@ -3298,7 +3362,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) { while (memcg) { __mem_cgroup_threshold(memcg, false); - if (do_swap_account) + if (do_memsw_account()) __mem_cgroup_threshold(memcg, true); memcg = parent_mem_cgroup(memcg); @@ -3498,16 +3562,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, swap_buffers: /* Swap primary and spare array */ thresholds->spare = thresholds->primary; - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -3588,88 +3653,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, return 0; } -#ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - int ret; - - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; - - return mem_cgroup_sockets_init(memcg, ss); -} - -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; - int kmemcg_id; - - if (!memcg->kmem_acct_active) - return; - - /* - * Clear the 'active' flag before clearing memcg_caches arrays entries. - * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it - * guarantees no cache will be created for this cgroup after we are - * done (see memcg_create_kmem_cache()). - */ - memcg->kmem_acct_active = false; - - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by - * memcg_drain_all_list_lrus(). - */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; - } - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); - - memcg_free_cache_id(kmemcg_id); -} - -static void memcg_destroy_kmem(struct mem_cgroup *memcg) -{ - if (memcg->kmem_acct_activated) { - memcg_destroy_kmem_caches(memcg); - static_key_slow_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); - } - mem_cgroup_sockets_destroy(memcg); -} -#else -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - return 0; -} - -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) -{ -} - -static void memcg_destroy_kmem(struct mem_cgroup *memcg) -{ -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) @@ -4057,7 +4040,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4090,7 +4072,29 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif -#endif + { + .name = "kmem.tcp.limit_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.failcnt", + .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, { }, /* terminate */ }; @@ -4129,153 +4133,92 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) kfree(memcg->nodeinfo[node]); } -static struct mem_cgroup *mem_cgroup_alloc(void) -{ - struct mem_cgroup *memcg; - size_t size; - - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); - if (!memcg) - return NULL; - - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) - goto out_free; - - if (memcg_wb_domain_init(memcg, GFP_KERNEL)) - goto out_free_stat; - - return memcg; - -out_free_stat: - free_percpu(memcg->stat); -out_free: - kfree(memcg); - return NULL; -} - -/* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. - */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void mem_cgroup_free(struct mem_cgroup *memcg) { int node; - mem_cgroup_remove_from_trees(memcg); - + memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(memcg->stat); - memcg_wb_domain_exit(memcg); kfree(memcg); } -/* - * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. - */ -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); -} -EXPORT_SYMBOL(parent_mem_cgroup); - -static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - long error = -ENOMEM; + size_t size; int node; - memcg = mem_cgroup_alloc(); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return NULL; + + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto fail; for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) - goto free_out; + goto fail; - /* root ? */ - if (parent_css == NULL) { - root_mem_cgroup = memcg; - mem_cgroup_root_css = &memcg->css; - page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; - page_counter_init(&memcg->memsw, NULL); - page_counter_init(&memcg->kmem, NULL); - } + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; + INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); -#ifdef CONFIG_MEMCG_KMEM + memcg->socket_pressure = jiffies; +#ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif - return &memcg->css; - -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + return memcg; +fail: + mem_cgroup_free(memcg); + return NULL; } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static struct cgroup_subsys_state * __ref +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - int ret; - - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - - if (!parent) - return 0; - - mutex_lock(&memcg_create_mutex); + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); + struct mem_cgroup *memcg; + long error = -ENOMEM; - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; - memcg->swappiness = mem_cgroup_swappiness(parent); + memcg = mem_cgroup_alloc(); + if (!memcg) + return ERR_PTR(error); - if (parent->use_hierarchy) { + memcg->high = PAGE_COUNTER_MAX; + memcg->soft_limit = PAGE_COUNTER_MAX; + if (parent) { + memcg->swappiness = mem_cgroup_swappiness(parent); + memcg->oom_kill_disable = parent->oom_kill_disable; + } + if (parent && parent->use_hierarchy) { + memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); - - /* - * No need to take a reference to the parent because cgroup - * core guarantees its existence. - */ + page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4284,18 +4227,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent != root_mem_cgroup) memory_cgrp_subsys.broken_hierarchy = true; } - mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); - if (ret) - return ret; + /* The following stuff does not apply to the root */ + if (!parent) { + root_mem_cgroup = memcg; + return &memcg->css; + } - /* - * Make sure the memcg is initialized: mem_cgroup_iter() - * orders reading memcg->initialized against its callers - * reading the memcg members. - */ - smp_store_release(&memcg->initialized, 1); + error = memcg_propagate_kmem(parent, memcg); + if (error) + goto fail; + + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_inc(&memcg_sockets_enabled_key); + + return &memcg->css; +fail: + mem_cgroup_free(memcg); + return NULL; +} + +static int +mem_cgroup_css_online(struct cgroup_subsys_state *css) +{ + if (css->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; return 0; } @@ -4317,19 +4273,32 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - vmpressure_cleanup(&memcg->vmpressure); + memcg_offline_kmem(memcg); + wb_memcg_offline(memcg); +} - memcg_deactivate_kmem(memcg); +static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); - wb_memcg_offline(memcg); + invalidate_reclaim_iterators(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_destroy_kmem(memcg); - __mem_cgroup_free(memcg); + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_dec(&memcg_sockets_enabled_key); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) + static_branch_dec(&memcg_sockets_enabled_key); + + vmpressure_cleanup(&memcg->vmpressure); + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); + memcg_free_kmem(memcg); + mem_cgroup_free(memcg); } /** @@ -4445,7 +4414,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), ent.val); - if (do_swap_account) + if (do_memsw_account()) entry->val = ent.val; return page; @@ -4480,7 +4449,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_swap_account) + if (do_memsw_account()) *entry = swp; page = find_get_page(swap_address_space(swp), swp.val); } @@ -4499,38 +4468,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 + * The caller must make sure the page is not on LRU (isolate_page() is useful.) * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, + bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { unsigned long flags; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + VM_BUG_ON(compound && !PageTransHuge(page)); /* * Prevent mem_cgroup_replace_page() from looking at * page->mem_cgroup of its source page while we change it. */ + ret = -EBUSY; if (!trylock_page(page)) goto out; @@ -4585,9 +4546,9 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); + mem_cgroup_charge_statistics(to, page, compound, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); + mem_cgroup_charge_statistics(from, page, compound, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -4677,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4782,7 +4744,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; @@ -4865,17 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, union mc_target target; struct page *page; - /* - * We don't take compound_lock() here but no race with splitting thp - * happens because: - * - if pmd_trans_huge_lock() returns 1, the relevant thp is not - * under splitting, which means there's no concurrent thp split, - * - if another thread runs into split_huge_page() just after we - * entered this if-block, the thread must wait for page table lock - * to be unlocked in __split_huge_page_splitting(), where the main - * part of thp split is not executed yet. - */ - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(ptl); return 0; @@ -4884,7 +4837,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; @@ -4911,9 +4864,18 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; + /* + * We can have a part of the split pmd here. Moving it + * can be done but it would be too convoluted so simply + * ignore such a partial THP and keep it in original + * memcg. There should be somebody mapping the head. + */ + if (PageTransCompound(page)) + goto put; if (isolate_lru_page(page)) goto put; - if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, false, + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -5148,6 +5110,59 @@ static int memory_events_show(struct seq_file *m, void *v) return 0; } +static int memory_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + int i; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. + * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_printf(m, "anon %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + seq_printf(m, "file %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + seq_printf(m, "sock %llu\n", + (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + + seq_printf(m, "file_mapped %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * + PAGE_SIZE); + seq_printf(m, "file_dirty %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * + PAGE_SIZE); + seq_printf(m, "file_writeback %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) { + struct mem_cgroup *mi; + unsigned long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)); + seq_printf(m, "%s %llu\n", + mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); + } + + /* Accumulated memory events */ + + seq_printf(m, "pgfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + + return 0; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5178,6 +5193,11 @@ static struct cftype memory_files[] = { .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, + { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_stat_show, + }, { } /* terminate */ }; @@ -5185,6 +5205,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, + .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, @@ -5251,10 +5272,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * with mem_cgroup_cancel_charge() in case page instantiation fails. */ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) @@ -5284,11 +5306,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, } } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - if (!memcg) memcg = get_mem_cgroup_from_mm(mm); @@ -5317,9 +5334,9 @@ out: * Use mem_cgroup_cancel_charge() to cancel the transaction instead. */ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); @@ -5336,17 +5353,12 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, commit_charge(page, memcg, lrucare); - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); + mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_swap_account && PageSwapCache(page)) { + if (do_memsw_account() && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -5364,9 +5376,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; if (mem_cgroup_disabled()) return; @@ -5378,11 +5391,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) if (!memcg) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - cancel_charge(memcg, nr_pages); } @@ -5395,7 +5403,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -5521,7 +5529,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; - int isolated; + unsigned int nr_pages; + bool compound; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -5541,12 +5550,133 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) if (!memcg) return; - lock_page_lru(oldpage, &isolated); - oldpage->mem_cgroup = NULL; - unlock_page_lru(oldpage, isolated); + /* Force-charge the new page. The old one will be freed soon */ + compound = PageTransHuge(newpage); + nr_pages = compound ? hpage_nr_pages(newpage) : 1; + + page_counter_charge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); commit_charge(newpage, memcg, true); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); + memcg_check_events(memcg, newpage); + local_irq_enable(); +} + +DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); +EXPORT_SYMBOL(memcg_sockets_enabled_key); + +void sock_update_memcg(struct sock *sk) +{ + struct mem_cgroup *memcg; + + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_memcg) { + BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); + css_get(&sk->sk_memcg->css); + return; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (memcg == root_mem_cgroup) + goto out; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) + goto out; + if (css_tryget_online(&memcg->css)) + sk->sk_memcg = memcg; +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + WARN_ON(!sk->sk_memcg); + css_put(&sk->sk_memcg->css); +} + +/** + * mem_cgroup_charge_skmem - charge socket memory + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * + * Charges @nr_pages to @memcg. Returns %true if the charge fit within + * @memcg's configured limit, %false if the charge had to be forced. + */ +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + gfp_t gfp_mask = GFP_KERNEL; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + struct page_counter *fail; + + if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { + memcg->tcpmem_pressure = 0; + return true; + } + page_counter_charge(&memcg->tcpmem, nr_pages); + memcg->tcpmem_pressure = 1; + return false; + } + + /* Don't block in the packet receive path */ + if (in_softirq()) + gfp_mask = GFP_NOWAIT; + + this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + + if (try_charge(memcg, gfp_mask, nr_pages) == 0) + return true; + + try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @memcg - memcg to uncharge + * @nr_pages - number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + page_counter_uncharge(&memcg->tcpmem, nr_pages); + return; + } + + this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + + page_counter_uncharge(&memcg->memory, nr_pages); + css_put_many(&memcg->css, nr_pages); +} + +static int __init cgroup_memory(char *s) +{ + char *token; + + while ((token = strsep(&s, ",")) != NULL) { + if (!*token) + continue; + if (!strcmp(token, "nosocket")) + cgroup_memory_nosocket = true; + if (!strcmp(token, "nokmem")) + cgroup_memory_nokmem = true; + } + return 0; } +__setup("cgroup.memory=", cgroup_memory); /* * subsys_initcall() for memory controller. @@ -5603,7 +5733,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_swap_account) + if (!do_memsw_account()) return; memcg = page->mem_cgroup; @@ -5628,15 +5758,51 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -1); + mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); } +/* + * mem_cgroup_try_charge_swap - try charging a swap entry + * @page: page being added to swap + * @entry: swap entry to charge + * + * Try to charge @entry to the memcg that @page belongs to. + * + * Returns 0 on success, -ENOMEM on failure. + */ +int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + struct page_counter *counter; + unsigned short oldid; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + return 0; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return 0; + + if (!mem_cgroup_is_root(memcg) && + !page_counter_try_charge(&memcg->swap, 1, &counter)) + return -ENOMEM; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + css_get(&memcg->css); + return 0; +} + /** * mem_cgroup_uncharge_swap - uncharge a swap entry * @entry: swap entry to uncharge * - * Drop the memsw charge associated with @entry. + * Drop the swap charge associated with @entry. */ void mem_cgroup_uncharge_swap(swp_entry_t entry) { @@ -5650,14 +5816,53 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memsw, 1); + if (!mem_cgroup_is_root(memcg)) { + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->swap, 1); + else + page_counter_uncharge(&memcg->memsw, 1); + } mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } rcu_read_unlock(); } +long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + long nr_swap_pages = get_nr_swap_pages(); + + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return nr_swap_pages; + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + nr_swap_pages = min_t(long, nr_swap_pages, + READ_ONCE(memcg->swap.limit) - + page_counter_read(&memcg->swap)); + return nr_swap_pages; +} + +bool mem_cgroup_swap_full(struct page *page) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (vm_swap_full()) + return true; + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + memcg = page->mem_cgroup; + if (!memcg) + return false; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + return true; + + return false; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; @@ -5675,6 +5880,63 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static u64 swap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; +} + +static int swap_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = READ_ONCE(memcg->swap.limit); + + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t swap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + mutex_lock(&memcg_limit_mutex); + err = page_counter_limit(&memcg->swap, max); + mutex_unlock(&memcg_limit_mutex); + if (err) + return err; + + return nbytes; +} + +static struct cftype swap_files[] = { + { + .name = "swap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_current_read, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_max_show, + .write = swap_max_write, + }, + { } /* terminate */ +}; + static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", @@ -5706,6 +5968,8 @@ static int __init mem_cgroup_swap_init(void) { if (!mem_cgroup_disabled() && really_do_swap_account) { do_swap_account = 1; + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, + swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8424b64..ac595e7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); - if (PageHuge(head)) - return get_page_unless_zero(head); - - /* - * Thp tail page has special refcounting rule (refcount of tail pages - * is stored in ->_mapcount,) so we can't call get_page_unless_zero() - * directly for tail pages. - */ - if (PageTransHuge(head)) { + if (!PageHuge(head) && PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. @@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page) page_to_pfn(page)); return 0; } - - if (get_page_unless_zero(head)) { - if (PageTail(page)) - get_page(page); - return 1; - } else { - return 0; - } } - return get_page_unless_zero(page); + return get_page_unless_zero(head); } EXPORT_SYMBOL_GPL(get_hwpoison_page); -/** - * put_hwpoison_page() - Put refcount for memory error handling: - * @page: raw error page (hit by memory error) - */ -void put_hwpoison_page(struct page *page) -{ - struct page *head = compound_head(page); - - if (PageHuge(head)) { - put_page(head); - return; - } - - if (PageTransHuge(head)) - if (page != head) - put_page(head); - - put_page(page); -} -EXPORT_SYMBOL_GPL(put_hwpoison_page); - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { + lock_page(hpage); if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + unlock_page(hpage); if (!PageAnon(hpage)) pr_err("MCE: %#lx: non anonymous thp\n", pfn); else @@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(p); return -EBUSY; } + unlock_page(hpage); + get_hwpoison_page(p); + put_hwpoison_page(hpage); VM_BUG_ON_PAGE(!page_count(p), p); hpage = compound_head(p); } @@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others - * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageLocked() * - to avoid races with __SetPageSlab*() (and more non-atomic ops) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. @@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) * Did it turn free? */ ret = __get_any_page(page, pfn, 0); - if (!PageLRU(page)) { + if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ put_hwpoison_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", @@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags) return ret; } +static int soft_offline_in_use_page(struct page *page, int flags) +{ + int ret; + struct page *hpage = compound_head(page); + + if (!PageHuge(page) && PageTransHuge(hpage)) { + lock_page(hpage); + if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + unlock_page(hpage); + if (!PageAnon(hpage)) + pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); + else + pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); + put_hwpoison_page(hpage); + return -EBUSY; + } + unlock_page(hpage); + get_hwpoison_page(page); + put_hwpoison_page(hpage); + } + + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + + return ret; +} + +static void soft_offline_free_page(struct page *page) +{ + if (PageHuge(page)) { + struct page *hpage = compound_head(page); + + set_page_hwpoison_huge_page(hpage); + if (!dequeue_hwpoisoned_huge_page(hpage)) + num_poisoned_pages_add(1 << compound_order(hpage)); + } else { + if (!TestSetPageHWPoison(page)) + num_poisoned_pages_inc(); + } +} + /** * soft_offline_page - Soft offline a page. * @page: page to offline @@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); @@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); return -EBUSY; } - if (!PageHuge(page) && PageTransHuge(hpage)) { - if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { - pr_info("soft offline: %#lx: failed to split THP\n", - pfn); - if (flags & MF_COUNT_INCREASED) - put_hwpoison_page(page); - return -EBUSY; - } - } get_online_mems(); - ret = get_any_page(page, pfn, flags); put_online_mems(); - if (ret > 0) { /* for in-use pages */ - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - } else if (ret == 0) { /* for free pages */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); - } - } + + if (ret > 0) + ret = soft_offline_in_use_page(page, flags); + else if (ret == 0) + soft_offline_free_page(page); + return ret; } diff --git a/mm/memory.c b/mm/memory.c index c387430..635451a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -50,6 +50,7 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> +#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> @@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); - int wait_split_huge_page; if (!new) return -ENOMEM; @@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ ptl = pmd_lock(mm, pmd); - wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; - } else if (unlikely(pmd_trans_splitting(*pmd))) - wait_split_huge_page = 1; + } spin_unlock(ptl); if (new) pte_free(mm, new); - if (wait_split_huge_page) - wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } else - VM_BUG_ON(pmd_trans_splitting(*pmd)); + } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -832,10 +827,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -873,11 +865,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + page_dup_rmap(page, false); + rss[mm_counter(page)]++; } out_set_pte: @@ -961,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd)) { + if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); err = copy_huge_pmd(dst_mm, src_mm, @@ -1113,9 +1102,8 @@ again: tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else { + + if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); @@ -1123,9 +1111,9 @@ again: if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); - rss[MM_FILEPAGES]--; } - page_remove_rmap(page); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { @@ -1146,11 +1134,7 @@ again: struct page *page; page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; + rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); @@ -1193,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { #ifdef CONFIG_DEBUG_VM if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { @@ -1204,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, BUG(); } #endif - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -1460,7 +1444,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -1517,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1533,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, goto out_unlock; /* Ok, finally just insert the thing.. */ - entry = pte_mkspecial(pfn_pte(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); + else + entry = pte_mkspecial(pfn_t_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1580,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) + if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) return -EINVAL; - ret = insert_pfn(vma, addr, pfn, pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); return ret; } EXPORT_SYMBOL(vm_insert_pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) + pfn_t pfn) { BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); @@ -1604,10 +1591,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; - page = pfn_to_page(pfn); + /* + * At this point we are committed to insert_page() + * regardless of whether the caller specified flags that + * result in pfn_t_has_page() == false. + */ + page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, vma->vm_page_prot); } return insert_pfn(vma, addr, pfn, vma->vm_page_prot); @@ -1949,6 +1941,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo copy_user_highpage(dst, src, va, vma); } +static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +{ + struct file *vm_file = vma->vm_file; + + if (vm_file) + return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; + + /* + * Special mappings (e.g. VDSO) do not have any file so fake + * a default GFP_KERNEL for them. + */ + return GFP_KERNEL; +} + /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. @@ -1964,6 +1970,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.page = page; vmf.cow_page = NULL; @@ -2083,7 +2090,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, cow_user_page(new_page, old_page, address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -2097,7 +2104,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, MM_FILEPAGES); + dec_mm_counter_fast(mm, + mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { @@ -2113,8 +2121,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2146,14 +2154,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page); + page_remove_rmap(old_page, false); } /* Free the old page.. */ new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) @@ -2168,7 +2176,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, */ if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ - munlock_vma_page(old_page); + if (PageMlocked(old_page)) + munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); @@ -2228,11 +2237,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). - */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; @@ -2528,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2562,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; - exclusive = 1; + exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) @@ -2570,15 +2574,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, pte); if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, address, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (mem_cgroup_swap_full(page) || + (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache) { @@ -2608,7 +2613,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2702,7 +2707,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!page) goto oom; - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* @@ -2723,15 +2728,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(page_table, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return handle_userfault(vma, address, flags, VM_UFFD_MISSING); } inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, address, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2742,7 +2747,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); goto unlock; oom_free_page: @@ -2767,6 +2772,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); @@ -2818,9 +2824,9 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); } else { - inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); } set_pte_at(vma->vm_mm, address, pte, entry); @@ -2933,6 +2939,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.max_pgoff = max_pgoff; vmf.flags = flags; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vma->vm_ops->map_pages(vma, &vmf); } @@ -2993,7 +3000,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -3022,7 +3029,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); if (fault_page) { @@ -3037,7 +3044,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); page_cache_release(new_page); return ret; } @@ -3089,7 +3096,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = fault_page->mapping; + mapping = page_rmapping(fault_page); unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { /* @@ -3191,6 +3198,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* TODO: handle PTE-mapped THP */ + if (PageCompound(page)) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -3363,17 +3376,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, int ret; barrier(); - if (pmd_trans_huge(orig_pmd)) { + if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - /* - * If the pmd is splitting, return and retry the - * the fault. Alternative: wait until the split - * is done, and goto retry. - */ - if (pmd_trans_splitting(orig_pmd)) - return 0; - if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); @@ -3400,7 +3405,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) + if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) return 0; /* * A regular pmd is established and it can't morph into a huge pmd diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 67d488a..4af58a3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -17,6 +17,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/memory.h> +#include <linux/memremap.h> #include <linux/memory_hotplug.h> #include <linux/highmem.h> #include <linux/vmalloc.h> @@ -131,7 +132,8 @@ static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; res = kzalloc(sizeof(struct resource), GFP_KERNEL); - BUG_ON(!res); + if (!res) + return ERR_PTR(-ENOMEM); res->name = "System RAM"; res->start = start; @@ -140,7 +142,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) if (request_resource(&iomem_resource, res) < 0) { pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); - res = NULL; + return ERR_PTR(-EEXIST); } return res; } @@ -505,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap; + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); + if (altmap) { + /* + * Validate altmap is within bounds of the total request + */ + if (altmap->base_pfn != phys_start_pfn + || vmem_altmap_offset(altmap) > nr_pages) { + pr_warn_once("memory add fail, invalid altmap\n"); + return -EINVAL; + } + altmap->alloc = 0; + } + for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, zone, section_nr_to_pfn(i)); @@ -730,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms) +static int __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { unsigned long start_pfn; int scn_nr; @@ -747,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) start_pfn = section_nr_to_pfn(scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms); + sparse_remove_one_section(zone, ms, map_offset); return 0; } @@ -766,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; - int sections_to_remove; - resource_size_t start, size; - int ret = 0; + unsigned long map_offset = 0; + int sections_to_remove, ret = 0; + + /* In the ZONE_DEVICE case device driver owns the memory region */ + if (is_dev_zone(zone)) { + struct page *page = pfn_to_page(phys_start_pfn); + struct vmem_altmap *altmap; + + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + map_offset = vmem_altmap_offset(altmap); + } else { + resource_size_t start, size; + + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + + ret = release_mem_region_adjustable(&iomem_resource, start, + size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + } /* * We can only remove entire sections @@ -776,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - /* in the ZONE_DEVICE case device driver owns the memory region */ - if (!is_dev_zone(zone)) - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - ret = __remove_section(zone, __pfn_to_section(pfn)); + + ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); + map_offset = 0; if (ret) break; } @@ -1312,8 +1342,8 @@ int __ref add_memory(int nid, u64 start, u64 size) int ret; res = register_memory_resource(start, size); - if (!res) - return -EEXIST; + if (IS_ERR(res)) + return PTR_ERR(res); ret = add_memory_resource(nid, res); if (ret < 0) @@ -1375,23 +1405,30 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) */ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, sec_end_pfn; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); pfn < end_pfn; - pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) - i++; - if (i == MAX_ORDER_NR_PAGES) + pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + /* Make sure the memory section is present first */ + if (!present_section_nr(pfn_to_section_nr(pfn))) continue; - page = pfn_to_page(pfn + i); - if (zone && page_zone(page) != zone) - return 0; - zone = page_zone(page); + for (; pfn < sec_end_pfn && pfn < end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && + !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } } return 1; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87a1779..4c4187c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid; + int nid, ret; pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(vma, addr, pmd); - if (pmd_trans_unstable(pmd)) - return 0; + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, addr); + } else { + get_page(page); + spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return 0; + } + } else { + spin_unlock(ptl); + } + } +retry: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -513,9 +532,23 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; + if (PageTail(page) && PageAnon(page)) { + get_page(page); + pte_unmap_unlock(pte, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + /* Failed to split -- skip. */ + if (ret) { + pte = pte_offset_map_lock(walk->mm, pmd, + addr, &ptl); + continue; + } + goto retry; + } - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, qp->pagelist, flags); + migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -591,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (vma->vm_flags & VM_PFNMAP) + if (!vma_migratable(vma)) return 1; if (endvma > end) @@ -615,10 +648,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, return 1; } - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) - /* queue pages from current vma */ + /* queue pages from current vma */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } @@ -2142,12 +2173,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->lock spinlock, which should be held + * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ -/* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* + * lookup first element intersecting start-end. Caller holds sp->lock for + * reading or for writing + */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2178,8 +2211,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) return rb_entry(n, struct sp_node, nd); } -/* Insert a new shared policy into the list. */ -/* Caller holds sp->lock */ +/* + * Insert a new shared policy into the list. Caller holds sp->lock for + * writing. + */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -2211,13 +2246,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + read_unlock(&sp->lock); return pol; } @@ -2360,7 +2395,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, int ret = 0; restart: - spin_lock(&sp->lock); + write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2393,7 +2428,7 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = 0; err_out: @@ -2405,7 +2440,7 @@ err_out: return ret; alloc_new: - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) @@ -2431,7 +2466,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2497,14 +2532,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - spin_unlock(&p->lock); + write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/migrate.c b/mm/migrate.c index 7890d0b..b1034f9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, addr); else - page_dup_rmap(new); + page_dup_rmap(new, true); } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr); + page_add_anon_rmap(new, vma, addr, false); else page_add_file_rmap(new); @@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) + if (unlikely(PageTransHuge(page))) { + lock_page(page); + rc = split_huge_page(page); + unlock_page(page); + if (rc) goto out; + } rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) @@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; + prep_transhuge_page(new_page); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { @@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ @@ -1815,7 +1820,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1826,14 +1831,14 @@ fail_putback: flush_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page); + page_remove_rmap(new_page, true); goto fail_putback; } mlock_migrate_page(new_page, page); set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/mincore.c b/mm/mincore.c index 14bb9fb..563f320 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { memset(vec, 1, nr); spin_unlock(ptl); goto out; @@ -24,13 +24,13 @@ #include "internal.h" -int can_do_mlock(void) +bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) - return 1; + return true; if (capable(CAP_IPC_LOCK)) - return 1; - return 0; + return true; + return false; } EXPORT_SYMBOL(can_do_mlock); @@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page) /* Serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + if (!TestSetPageMlocked(page)) { mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); @@ -172,12 +175,14 @@ static void __munlock_isolation_failed(struct page *page) */ unsigned int munlock_vma_page(struct page *page) { - unsigned int nr_pages; + int nr_pages; struct zone *zone = page_zone(page); /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + /* * Serialize with any parallel __split_huge_page_refcount() which * might otherwise copy PageMlocked to part of the tail pages before @@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, if (!page || page_zone_id(page) != zoneid) break; + /* + * Do not use pagevec for PTE-mapped THP, + * munlock_vma_pages_range() will handle them. + */ + if (PageTransCompound(page)) + break; + get_page(page); /* * Increase the address that will be returned *before* the @@ -425,7 +437,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, vma->vm_flags &= VM_LOCKED_CLEAR_MASK; while (start < end) { - struct page *page = NULL; + struct page *page; unsigned int page_mask; unsigned long page_increm; struct pagevec pvec; @@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, &page_mask); if (page && !IS_ERR(page)) { - if (PageTransHuge(page)) { + if (PageTransTail(page)) { + VM_BUG_ON_PAGE(PageMlocked(page), page); + put_page(page); /* follow_page_mask() */ + } else if (PageTransHuge(page)) { lock_page(page); /* * Any THP page found by follow_page_mask() may @@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, goto next; } } - /* It's a bug to munlock in the middle of a THP page */ - VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); page_increm = 1 + page_mask; start += page_increm * PAGE_SIZE; next: @@ -42,6 +42,7 @@ #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> +#include <linux/moduleparam.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -58,6 +59,20 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + +static bool ignore_rlimit_data = true; +core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -375,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) } #ifdef CONFIG_DEBUG_VM_RB -static int browse_rb(struct rb_root *root) +static int browse_rb(struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; @@ -399,12 +415,14 @@ static int browse_rb(struct rb_root *root) vma->vm_start, vma->vm_end); bug = 1; } + spin_lock(&mm->page_table_lock); if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; } + spin_unlock(&mm->page_table_lock); i++; pn = nd; prev = vma->vm_start; @@ -441,12 +459,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -460,7 +482,7 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(&mm->mm_rb); + i = browse_rb(mm); if (i != mm->map_count) { if (i != -1) pr_emerg("map_count %d rb %d\n", mm->map_count, i); @@ -1208,24 +1230,6 @@ none: return NULL; } -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *mm, unsigned long flags, - struct file *file, long pages) -{ - const unsigned long stack_flags - = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); - - mm->total_vm += pages; - - if (file) { - mm->shared_vm += pages; - if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) - mm->exec_vm += pages; - } else if (flags & stack_flags) - mm->stack_vm += pages; -} -#endif /* CONFIG_PROC_FS */ - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1544,19 +1548,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; /* * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. */ - if (!(vm_flags & MAP_FIXED)) - return -ENOMEM; - nr_pages = count_vma_pages_range(mm, addr, addr + len); - if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } @@ -1655,7 +1657,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, out: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) @@ -2102,7 +2104,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long new_start, actual_size; /* address space limit tests */ - if (!may_expand_vm(mm, grow)) + if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -2147,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - int error; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2190,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2199,8 +2196,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2214,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2230,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2266,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2275,8 +2267,7 @@ int expand_downwards(struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; @@ -2288,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma, } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2390,7 +2381,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vm_stat_account(mm, vma->vm_flags, -nrpages); vma = remove_vma(vma); } while (vma); vm_unacct_memory(nr_accounted); @@ -2760,7 +2751,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) } /* Check against address space limits *after* clearing old maps... */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2795,6 +2786,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; @@ -2986,16 +2978,36 @@ out: * Return true if the calling process may expand its vm space by the passed * number of pages */ -int may_expand_vm(struct mm_struct *mm, unsigned long npages) -{ - unsigned long cur = mm->total_vm; /* pages */ - unsigned long lim; +bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +{ + if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) + return false; + + if (is_data_mapping(flags) && + mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { + if (ignore_rlimit_data) + pr_warn_once("%s (%d): VmData %lu exceed data ulimit " + "%lu. Will be forbidden soon.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA)); + else + return false; + } + + return true; +} - lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; +void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) +{ + mm->total_vm += npages; - if (cur + npages > lim) - return 0; - return 1; + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; } static int special_mapping_fault(struct vm_area_struct *vma, @@ -3077,7 +3089,7 @@ static struct vm_area_struct *__install_special_mapping( if (ret) goto out; - mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); @@ -3181,10 +3193,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * - * We can take all the locks in random order because the VM code - * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never - * takes more than one of them in a row. Secondly we're protected - * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * We take locks in following order, accordingly to comment at beginning + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * + * We can take all locks within these types randomly because the VM code + * doesn't nest them and we protected from parallel mm_take_all_locks() by + * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. @@ -3203,7 +3221,16 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping) + if (vma->vm_file && vma->vm_file->f_mapping && + is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } diff --git a/mm/mmzone.c b/mm/mmzone.c index 7d87ebb..52687fb 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, } #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -int memmap_valid_within(unsigned long pfn, +bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone) { if (page_to_pfn(page) != pfn) - return 0; + return false; if (page_zone(page) != zone) - return 0; + return false; - return 1; + return true; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8e..8eb7bb4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + && pmd_none_or_clear_bad(pmd)) continue; /* invoke the mmu notifier if the pmd is populated */ @@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); @@ -278,6 +279,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { + /* Check space limits when area turns into data. */ + if (!may_expand_vm(mm, newflags, nrpages) && + may_expand_vm(mm, oldflags, nrpages)) + return -ENOMEM; if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; @@ -334,8 +339,8 @@ success: populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index c25bc62..d77946a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (!new_pmd) break; if (pmd_trans_huge(*old_pmd)) { - int err = 0; if (extent == HPAGE_PMD_SIZE) { + bool moved; VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, vma); /* See comment in move_ptes() */ if (need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - err = move_huge_pmd(vma, new_vma, old_addr, + moved = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); + if (moved) { + need_flush = true; + continue; + } } - if (err > 0) { - need_flush = true; - continue; - } else if (!err) { - split_huge_page_pmd(vma, old_addr, old_pmd); - } + split_huge_pmd(vma, old_pmd, old_addr); VM_BUG_ON(pmd_trans_huge(*old_pmd)); } if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, @@ -317,7 +316,11 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); + + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn_moved(vma); if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ @@ -379,7 +382,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EAGAIN); } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + if (!may_expand_vm(mm, vma->vm_flags, + (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { @@ -541,7 +545,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; locked = true; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e57cf24..99feb2b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) @@ -560,7 +560,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); - vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c126809..dc490c0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES))); + K(get_mm_counter(victim->mm, MM_FILEPAGES)), + K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); task_unlock(victim); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d15d88c..6fe7d15 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) unsigned long nr_pages; nr_pages = zone_page_state(zone, NR_FREE_PAGES); - nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + nr_pages -= min(nr_pages, zone->totalreserve_pages); nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); @@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void) unsigned long x; x = global_page_state(NR_FREE_PAGES); - x -= min(x, dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + x -= min(x, totalreserve_pages); x += global_page_state(NR_INACTIVE_FILE); x += global_page_state(NR_ACTIVE_FILE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d666df..838ca8bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include <linux/vmalloc.h> #include <linux/vmstat.h> #include <linux/mempolicy.h> +#include <linux/memremap.h> #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> @@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -/* - * When calculating the number of globally allowed dirty pages, there - * is a certain number of per-zone reserves that should not be - * considered dirtyable memory. This is the sum of those reserves - * over all existing zones that contribute dirtyable memory. - */ -unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -229,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; -static void free_compound_page(struct page *page); compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, #ifdef CONFIG_HUGETLB_PAGE free_huge_page, #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + free_transhuge_page, +#endif }; int min_free_kbytes = 1024; @@ -457,7 +453,7 @@ out: * This usage means that zero-order pages may not be compound. */ -static void free_compound_page(struct page *page) +void free_compound_page(struct page *page) { __free_pages_ok(page, compound_order(page)); } @@ -473,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); + p->mapping = TAIL_MAPPING; set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } #ifdef CONFIG_DEBUG_PAGEALLOC @@ -739,7 +737,7 @@ static inline int free_pages_check(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -812,7 +810,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; /* migratetype of the to-be-freed page */ - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); @@ -863,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) ret = 0; goto out; } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); goto out; @@ -873,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } ret = 0; out: + page->mapping = NULL; clear_compound_head(page); return ret; } @@ -1336,7 +1356,7 @@ static inline int check_new_page(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -1417,11 +1437,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - if (list_empty(&area->free_list[migratetype])) - continue; - - page = list_entry(area->free_list[migratetype].next, + page = list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); + if (!page) + continue; list_del(&page->lru); rmv_page_order(page); area->nr_free--; @@ -1700,12 +1719,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); - if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + page = list_first_entry_or_null( + &area->free_list[MIGRATE_HIGHATOMIC], + struct page, lru); + if (!page) continue; - page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, - struct page, lru); - /* * It should never happen but changes to locking could * inadvertently allow a per-cpu drain to add pages @@ -1753,7 +1772,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) if (fallback_mt == -1) continue; - page = list_entry(area->free_list[fallback_mt].next, + page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); if (can_steal) steal_suitable_fallback(zone, page, start_migratetype); @@ -1788,7 +1807,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. */ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype, gfp_t gfp_flags) + int migratetype) { struct page *page; @@ -1818,7 +1837,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype, 0); + struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) break; @@ -1988,7 +2007,7 @@ void mark_free_pages(struct zone *zone) unsigned long pfn, max_zone_pfn; unsigned long flags; unsigned int order, t; - struct list_head *curr; + struct page *page; if (zone_is_empty(zone)) return; @@ -1998,17 +2017,17 @@ void mark_free_pages(struct zone *zone) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - + page = pfn_to_page(pfn); if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } for_each_migratetype_order(order, t) { - list_for_each(curr, &zone->free_area[order].free_list[t]) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], lru) { unsigned long i; - pfn = page_to_pfn(list_entry(curr, struct page, lru)); + pfn = page_to_pfn(page); for (i = 0; i < (1UL << order); i++) swsusp_set_page_free(pfn_to_page(pfn + i)); } @@ -2212,9 +2231,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } if (cold) - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); else - page = list_entry(list->next, struct page, lru); + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; @@ -2241,7 +2260,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, trace_mm_page_alloc_zone_locked(page, order, migratetype); } if (!page) - page = __rmqueue(zone, order, migratetype, gfp_flags); + page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2740,8 +2759,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; + + if (gfp_mask & __GFP_NOFAIL) { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + } + } out: mutex_unlock(&oom_lock); return page; @@ -2876,28 +2908,6 @@ retry: return page; } -/* - * This is called in the allocator slow-path if the allocation request is of - * sufficient urgency to ignore watermarks and take other desperate measures - */ -static inline struct page * -__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, - const struct alloc_context *ac) -{ - struct page *page; - - do { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - - if (!page && gfp_mask & __GFP_NOFAIL) - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, - HZ/50); - } while (!page && (gfp_mask & __GFP_NOFAIL)); - - return page; -} - static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; @@ -3042,28 +3052,36 @@ retry: * allocations are system rather than user orientated */ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - - page = __alloc_pages_high_priority(gfp_mask, order, ac); - - if (page) { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + if (page) goto got_pg; - } } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* - * All existing users of the deprecated __GFP_NOFAIL are - * blockable, so warn of any new users that actually allow this - * type of allocation to fail. + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually allow this type of allocation + * to fail. */ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; } /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) { + /* + * __GFP_NOFAIL request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us. + */ + if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + cond_resched(); + goto retry; + } goto nopage; + } /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) @@ -3402,7 +3420,8 @@ EXPORT_SYMBOL(__free_page_frag); /* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup. + * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is + * equivalent to alloc_pages. * * It should be used when the caller would like to use kmalloc, but since the * allocation is large, it has to fall back to the page allocator. @@ -4147,8 +4166,7 @@ static void set_zonelist_order(void) static void build_zonelists(pg_data_t *pgdat) { - int j, node, load; - enum zone_type i; + int i, node, load; nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; @@ -4168,7 +4186,7 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - j = 0; + i = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* @@ -4185,12 +4203,12 @@ static void build_zonelists(pg_data_t *pgdat) if (order == ZONELIST_ORDER_NODE) build_zonelists_in_node_order(pgdat, node); else - node_order[j++] = node; /* remember order */ + node_order[i++] = node; /* remember order */ } if (order == ZONELIST_ORDER_ZONE) { /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, j); + build_zonelists_in_zone_order(pgdat, i); } build_thisnode_zonelists(pgdat); @@ -4468,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - pg_data_t *pgdat = NODE_DATA(nid); + struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); unsigned long end_pfn = start_pfn + size; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; - struct zone *z; unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &pgdat->node_zones[zone]; + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory + */ + if (altmap && start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -5186,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&pgdat->split_queue_lock); + INIT_LIST_HEAD(&pgdat->split_queue); + pgdat->split_queue_len = 0; +#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); @@ -5956,20 +5985,12 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; + + zone->totalreserve_pages = max; + reserve_pages += max; - /* - * Lowmem reserves are not available to - * GFP_HIGHUSER page cache allocations and - * kswapd tries to balance zones to their high - * watermark. As a result, neither should be - * regarded as dirtyable memory, to prevent a - * situation where reclaim has to clean pages - * in order to balance the zones. - */ - zone->dirty_balance_reserve = max; } } - dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } @@ -6599,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page) return !has_unmovable_pages(zone, page, 0, true); } -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) { @@ -6724,8 +6745,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + /* + * In case of -EBUSY, we'd like to know which page causes problem. + * So, just fall through. We will check it in test_pages_isolated(). + */ ret = __alloc_contig_migrate_range(&cc, start, end); - if (ret) + if (ret && ret != -EBUSY) goto done; /* @@ -6752,12 +6777,25 @@ int alloc_contig_range(unsigned long start, unsigned long end, outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { if (++order >= MAX_ORDER) { - ret = -EBUSY; - goto done; + outer_start = start; + break; } outer_start &= ~0UL << order; } + if (outer_start != start) { + order = page_order(pfn_to_page(outer_start)); + + /* + * outer_start page could be small order buddy page and + * it doesn't include start page. Adjust outer_start + * in this case to report failed page properly + * on tracepoint in test_pages_isolated() + */ + if (outer_start + (1UL << order) <= start) + outer_start = start; + } + /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { pr_info("%s: [%lx, %lx) PFNs busy\n", diff --git a/mm/page_idle.c b/mm/page_idle.c index d5dd790..4ea9c4e 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page, unsigned long addr, void *arg) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; pmd_t *pmd; pte_t *pte; + spinlock_t *ptl; bool referenced = false; - if (unlikely(PageTransHuge(page))) { - pmd = page_check_address_pmd(page, mm, addr, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (pmd) { - referenced = pmdp_clear_young_notify(vma, addr, pmd); - spin_unlock(ptl); - } + if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl)) + return SWAP_AGAIN; + + if (pte) { + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap(pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + referenced = pmdp_clear_young_notify(vma, addr, pmd); } else { - pte = page_check_address(page, mm, addr, &ptl, 0); - if (pte) { - referenced = ptep_clear_young_notify(vma, addr, pte); - pte_unmap_unlock(pte, ptl); - } + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } + + spin_unlock(ptl); + if (referenced) { clear_page_idle(page); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4568fd5..92c4c36 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -9,6 +9,9 @@ #include <linux/hugetlb.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/page_isolation.h> + static int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { @@ -162,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long undo_pfn; struct page *page; - BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); - BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); + BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); for (pfn = start_pfn; pfn < end_pfn; @@ -193,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); - BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + + BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); + BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -212,7 +217,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * * Returns 1 if all pages in the range are isolated. */ -static int +static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { @@ -237,9 +242,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, else break; } - if (pfn < end_pfn) - return 0; - return 1; + + return pfn; } int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, @@ -248,7 +252,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn, flags; struct page *page; struct zone *zone; - int ret; /* * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages @@ -266,10 +269,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); - return ret ? 0 : -EBUSY; + + trace_test_pages_isolated(start_pfn, end_pfn, pfn); + + return pfn < end_pfn ? -EBUSY : 0; } struct page *alloc_migrate_target(struct page *page, unsigned long private, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 29f2f8b..2072444 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ again: if (!walk->pte_entry) continue; - split_huge_page_pmd_mm(walk->mm, addr, pmd); + split_huge_pmd(walk->vma, pmd, addr); if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b9..998607a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size) /** * pcpu_mem_free - free memory * @ptr: memory to free - * @size: size of the area * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ -static void pcpu_mem_free(void *ptr, size_t size) +static void pcpu_mem_free(void *ptr) { - if (size <= PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); + kvfree(ptr); } /** @@ -463,8 +459,8 @@ out_unlock: * pcpu_mem_free() might end up calling vfree() which uses * IRQ-unsafe lock and thus can't be called under pcpu_lock. */ - pcpu_mem_free(old, old_size); - pcpu_mem_free(new, new_size); + pcpu_mem_free(old); + pcpu_mem_free(new); return 0; } @@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk); return NULL; } @@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk); } /** diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 7d3db02..9d47676 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp)); + VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif -#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH -void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd = pmd_mksplitting(*pmdp); - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); - /* tlb flush only to serialize against gup-fast */ - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); -} -#endif - #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -176,13 +164,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); - if (list_empty(&pgtable->lru)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, - struct page, lru); + pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, + struct page, lru); + if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); - } return pgtable; } #endif diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071..5d453e5 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/mm/readahead.c b/mm/readahead.c index ba22d7f..20e58e8 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/file.h> +#include <linux/mm_inline.h> #include "internal.h" @@ -32,8 +33,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); -#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) - /* * see if a page needs releasing upon read_cache_pages() failure * - the caller of read_cache_pages() may have set PG_private or PG_fscache @@ -64,7 +63,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, struct page *victim; while (!list_empty(pages)) { - victim = list_to_page(pages); + victim = lru_to_page(pages); list_del(&victim->lru); read_cache_pages_invalidate_page(mapping, victim); } @@ -87,7 +86,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int ret = 0; while (!list_empty(pages)) { - page = list_to_page(pages); + page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, mapping_gfp_constraint(mapping, GFP_KERNEL))) { @@ -125,7 +124,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, } for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_to_page(pages); + struct page *page = lru_to_page(pages); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, mapping_gfp_constraint(mapping, GFP_KERNEL))) { @@ -23,21 +23,22 @@ * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_rwsem - * anon_vma->rwsem - * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed, isolate_lru_page) - * swap_lock (in swap_duplicate, swap_info_get) - * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) - * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) - * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, - * in arch-dependent flush_dcache_mmap_lock, - * within bdi.wb->list_lock in __sync_single_inode) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * mapping->i_mmap_rwsem + * anon_vma->rwsem + * mm->page_table_lock or pte_lock + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * mapping->tree_lock (widely used) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * mapping->tree_lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock @@ -428,8 +429,10 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); - anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, + SLAB_PANIC|SLAB_ACCOUNT); } /* @@ -565,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) anon_vma_unlock_read(anon_vma); } -/* - * At what user virtual address is page expected in @vma? - */ -static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) -{ - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); -} - -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - unsigned long address = __vma_address(page, vma); - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - - return address; -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH static void percpu_flush_tlb_batch_pages(void *data) { @@ -817,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * Check that @page is mapped at @address into @mm. In contrast to + * page_check_address(), this function can handle transparent huge pages. + * + * On success returns true with pte mapped and locked. For PMD-mapped + * transparent huge pages *@ptep is set to NULL. + */ +bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, + unsigned long address, pmd_t **pmdp, + pte_t **ptep, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ + pte = huge_pte_offset(mm, address); + if (!pte) + return false; + + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); + pmd = NULL; + goto check_pte; + } + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return false; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return false; + pmd = pmd_offset(pud, address); + + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock_pmd; + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + goto map_pte; + } + + if (pmd_page(*pmd) != page) + goto unlock_pmd; + + pte = NULL; + goto found; +unlock_pmd: + spin_unlock(ptl); + return false; + } else { + pmd_t pmde = *pmd; + + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + return false; + } +map_pte: + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + pte_unmap(pte); + return false; + } + + ptl = pte_lockptr(mm, pmd); +check_pte: + spin_lock(ptl); + + if (!pte_present(*pte)) { + pte_unmap_unlock(pte, ptl); + return false; + } + + /* THP can be referenced by any subpage */ + if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { + pte_unmap_unlock(pte, ptl); + return false; + } +found: + *ptep = pte; + *pmdp = pmd; + *ptlp = ptl; + return true; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + struct page_referenced_arg { int mapcount; int referenced; @@ -830,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; + struct page_referenced_arg *pra = arg; + pmd_t *pmd; + pte_t *pte; spinlock_t *ptl; int referenced = 0; - struct page_referenced_arg *pra = arg; - if (unlikely(PageTransHuge(page))) { - pmd_t *pmd; + if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl)) + return SWAP_AGAIN; - /* - * rmap might return false positives; we must filter - * these out using page_check_address_pmd(). - */ - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (!pmd) - return SWAP_AGAIN; - - if (vma->vm_flags & VM_LOCKED) { - spin_unlock(ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } - - /* go ahead even if the pmd is pmd_trans_splitting() */ - if (pmdp_clear_flush_young_notify(vma, address, pmd)) - referenced++; + if (vma->vm_flags & VM_LOCKED) { + if (pte) + pte_unmap(pte); spin_unlock(ptl); - } else { - pte_t *pte; - - /* - * rmap might return false positives; we must filter - * these out using page_check_address(). - */ - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - return SWAP_AGAIN; - - if (vma->vm_flags & VM_LOCKED) { - pte_unmap_unlock(pte, ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ + } + if (pte) { if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -884,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } - pte_unmap_unlock(pte, ptl); + pte_unmap(pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + } else { + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } + spin_unlock(ptl); if (referenced) clear_page_idle(page); @@ -933,7 +987,7 @@ int page_referenced(struct page *page, int ret; int we_locked = 0; struct page_referenced_arg pra = { - .mapcount = page_mapcount(page), + .mapcount = total_mapcount(page), .memcg = memcg, }; struct rmap_walk_control rwc = { @@ -1122,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page, * over the call to page_add_new_anon_rmap. */ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); - BUG_ON(page->index != linear_page_index(vma, address)); + BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); #endif } @@ -1131,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, @@ -1138,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { - do_page_add_anon_rmap(page, vma, address, 0); + do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); } /* @@ -1149,29 +1204,44 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) + struct vm_area_struct *vma, unsigned long address, int flags) { - int first = atomic_inc_and_test(&page->_mapcount); + bool compound = flags & RMAP_COMPOUND; + bool first; + + if (compound) { + atomic_t *mapcount; + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + mapcount = compound_mapcount_ptr(page); + first = atomic_inc_and_test(mapcount); + } else { + first = atomic_inc_and_test(&page->_mapcount); + } + if (first) { + int nr = compound ? hpage_nr_pages(page) : 1; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (PageTransHuge(page)) + if (compound) { __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); } if (unlikely(PageKsm(page))) return; VM_BUG_ON_PAGE(!PageLocked(page), page); + /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, exclusive); + __page_set_anon_rmap(page, vma, address, + flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); } @@ -1181,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. */ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); - atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* increment count (starts at -1) */ + atomic_set(compound_mapcount_ptr(page), 0); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } else { + /* Anon THP always mapped first with PMD */ + VM_BUG_ON_PAGE(PageTransCompound(page), page); + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1223,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page) memcg = mem_cgroup_begin_page_stat(page); - /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) { + /* hugetlb pages are always mapped with pmds */ + atomic_dec(compound_mapcount_ptr(page)); goto out; + } - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ - if (unlikely(PageHuge(page))) + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) goto out; /* @@ -1245,41 +1328,79 @@ out: mem_cgroup_end_page_stat(memcg); } +static void page_remove_anon_compound_rmap(struct page *page) +{ + int i, nr; + + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + if (TestClearPageDoubleMap(page)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * themi are still mapped. + */ + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + } else { + nr = HPAGE_PMD_NR; + } + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); + + if (nr) { + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); + deferred_split_huge_page(page); + } +} + /** * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from + * @page: page to remove mapping from + * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page) +void page_remove_rmap(struct page *page, bool compound) { if (!PageAnon(page)) { + VM_BUG_ON_PAGE(compound && !PageHuge(page), page); page_remove_file_rmap(page); return; } + if (compound) + return page_remove_anon_compound_rmap(page); + /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) return; - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (PageTransHuge(page)) - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); + __dec_zone_page_state(page, NR_ANON_PAGES); if (unlikely(PageMlocked(page))) clear_page_mlock(page); + if (PageTransCompound(page)) + deferred_split_huge_page(compound_head(page)); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1291,6 +1412,11 @@ void page_remove_rmap(struct page *page) */ } +struct rmap_private { + enum ttu_flags flags; + int lazyfreed; +}; + /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1302,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - enum ttu_flags flags = (enum ttu_flags)arg; + struct rmap_private *rp = arg; + enum ttu_flags flags = rp->flags; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1362,10 +1489,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHuge(page)) { hugetlb_count_sub(1 << compound_order(page), mm); } else { - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); @@ -1375,10 +1499,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * interest anymore. Simply discard the pte, vmscan * will take care of the rest. */ - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { swp_entry_t entry; pte_t swp_pte; @@ -1400,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See handle_pte_fault() ... */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); + + if (!PageDirty(page) && (flags & TTU_LZFREE)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + rp->lazyfreed++; + goto discard; + } + if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pte, pteval); ret = SWAP_FAIL; @@ -1418,9 +1547,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); } else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); - page_remove_rmap(page); +discard: + page_remove_rmap(page, PageHuge(page)); page_cache_release(page); out_unmap: @@ -1472,9 +1602,14 @@ static int page_not_mapped(struct page *page) int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_private rp = { + .flags = flags, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)flags, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1494,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) + if (ret != SWAP_MLOCK && !page_mapped(page)) { ret = SWAP_SUCCESS; + if (rp.lazyfreed && !PageDirty(page)) + ret = SWAP_LZFREE; + } return ret; } @@ -1517,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int try_to_munlock(struct page *page) { int ret; + struct rmap_private rp = { + .flags = TTU_MUNLOCK, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)TTU_MUNLOCK, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, @@ -1702,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(&page->_mapcount); + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); } @@ -1711,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); - atomic_set(&page->_mapcount, 0); + atomic_set(compound_mapcount_ptr(page), 0); __hugepage_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ @@ -360,6 +360,87 @@ static int shmem_free_swap(struct address_space *mapping, } /* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given offsets are swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct radix_tree_iter iter; + void **slot; + struct page *page; + unsigned long swapped = 0; + + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= end) + break; + + page = radix_tree_deref_slot(slot); + + /* + * This should only be possible to happen at index 0, so we + * don't need to reset the counter, nor do we risk infinite + * restarts. + */ + if (radix_tree_deref_retry(page)) + goto restart; + + if (radix_tree_exceptional_entry(page)) + swapped++; + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + + rcu_read_unlock(); + + return swapped << PAGE_SHIFT; +} + +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, + linear_page_index(vma, vma->vm_start), + linear_page_index(vma, vma->vm_end)); +} + +/* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) @@ -620,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } else - kfree(info->symlink); + } simple_xattrs_free(&info->xattrs); WARN_ON(inode->i_blocks); @@ -729,7 +809,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, + false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -752,9 +833,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) if (error) { if (error != -ENOMEM) error = 0; - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); } else - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); out: unlock_page(page); page_cache_release(page); @@ -830,6 +911,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; + if (mem_cgroup_try_charge_swap(page, swap)) + goto free_swap; + /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the page is @@ -858,6 +942,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); +free_swap: swapcache_free(swap); redirty: set_page_dirty(page); @@ -1004,7 +1089,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __set_page_locked(newpage); + __SetPageLocked(newpage); SetPageUptodate(newpage); SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); @@ -1137,7 +1222,8 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, swp_to_radix_entry(swap)); @@ -1154,14 +1240,14 @@ repeat: * "repeat": reading a hole and writing should succeed. */ if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); delete_from_swap_cache(page); } } if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); spin_lock(&info->lock); info->swapped--; @@ -1196,11 +1282,12 @@ repeat: } __SetPageSwapBacked(page); - __set_page_locked(page); + __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); @@ -1210,10 +1297,10 @@ repeat: radix_tree_preload_end(); } if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); goto decused; } - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_anon(page); spin_lock(&info->lock); @@ -1814,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ if (offset < 0) @@ -1838,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2003,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (info->seals & F_SEAL_SEAL) { error = -EPERM; @@ -2026,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) error = 0; unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(shmem_add_seals); @@ -2076,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; @@ -2189,7 +2276,7 @@ undone: inode->i_private = NULL; spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -2438,7 +2525,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int len; struct inode *inode; struct page *page; - char *kaddr; struct shmem_inode_info *info; len = strlen(symname) + 1; @@ -2462,14 +2548,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - info->symlink = kmemdup(symname, len, GFP_KERNEL); - if (!info->symlink) { + inode->i_link = kmemdup(symname, len, GFP_KERNEL); + if (!inode->i_link) { iput(inode); return -ENOMEM; } inode->i_op = &shmem_short_symlink_operations; - inode->i_link = info->symlink; } else { + inode_nohighmem(inode); error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { iput(inode); @@ -2477,9 +2563,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len); - kunmap_atomic(kaddr); + memcpy(page_address(page), symname, len); SetPageUptodate(page); set_page_dirty(page); unlock_page(page); @@ -2492,23 +2576,34 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static const char *shmem_follow_link(struct dentry *dentry, void **cookie) +static void shmem_put_link(void *arg) { - struct page *page = NULL; - int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); - if (error) - return ERR_PTR(error); - unlock_page(page); - *cookie = page; - return kmap(page); + mark_page_accessed(arg); + put_page(arg); } -static void shmem_put_link(struct inode *unused, void *cookie) +static const char *shmem_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct page *page = cookie; - kunmap(page); - mark_page_accessed(page); - page_cache_release(page); + struct page *page = NULL; + int error; + if (!dentry) { + page = find_get_page(inode->i_mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + if (error) + return ERR_PTR(error); + unlock_page(page); + } + set_delayed_call(done, shmem_put_link, page); + return page_address(page); } #ifdef CONFIG_TMPFS_XATTR @@ -2555,122 +2650,74 @@ static int shmem_initxattrs(struct inode *inode, return 0; } -static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL -}; - -static int shmem_xattr_validate(const char *name) -{ - struct { const char *prefix; size_t len; } arr[] = { - { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, - { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } - }; - int i; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - size_t preflen = arr[i].len; - if (strncmp(name, arr[i].prefix, preflen) == 0) { - if (!name[preflen]) - return -EINVAL; - return 0; - } - } - return -EOPNOTSUPP; -} - -static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int shmem_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); - - err = shmem_xattr_validate(name); - if (err) - return err; + name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } -static int shmem_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - err = shmem_xattr_validate(name); - if (err) - return err; + name = xattr_full_name(handler, name); return simple_xattr_set(&info->xattrs, name, value, size, flags); } -static int shmem_removexattr(struct dentry *dentry, const char *name) -{ - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); +static const struct xattr_handler shmem_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - err = shmem_xattr_validate(name); - if (err) - return err; +static const struct xattr_handler shmem_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - return simple_xattr_remove(&info->xattrs, name); -} +static const struct xattr_handler *shmem_xattr_handlers[] = { +#ifdef CONFIG_TMPFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &shmem_security_xattr_handler, + &shmem_trusted_xattr_handler, + NULL +}; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(&info->xattrs, buffer, size); + return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_link, - .put_link = shmem_put_link, + .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif }; @@ -3083,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -3103,7 +3151,7 @@ static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); return 0; } @@ -3142,10 +3190,10 @@ static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, .set_acl = simple_set_acl, #endif }; @@ -3164,10 +3212,10 @@ static const struct inode_operations shmem_dir_inode_operations = { .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, @@ -3177,10 +3225,10 @@ static const struct inode_operations shmem_dir_inode_operations = { static const struct inode_operations shmem_special_inode_operations = { #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, @@ -2756,6 +2756,21 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif +static struct page *get_first_slab(struct kmem_cache_node *n) +{ + struct page *page; + + page = list_first_entry_or_null(&n->slabs_partial, + struct page, lru); + if (!page) { + n->free_touched = 1; + page = list_first_entry_or_null(&n->slabs_free, + struct page, lru); + } + + return page; +} + static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, bool force_refill) { @@ -2791,18 +2806,12 @@ retry: } while (batchcount > 0) { - struct list_head *entry; struct page *page; /* Get slab alloc is to come from. */ - entry = n->slabs_partial.next; - if (entry == &n->slabs_partial) { - n->free_touched = 1; - entry = n->slabs_free.next; - if (entry == &n->slabs_free) - goto must_grow; - } + page = get_first_slab(n); + if (!page) + goto must_grow; - page = list_entry(entry, struct page, lru); check_spinlock_acquired(cachep); /* @@ -3085,7 +3094,6 @@ retry: static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - struct list_head *entry; struct page *page; struct kmem_cache_node *n; void *obj; @@ -3098,15 +3106,10 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - entry = n->slabs_partial.next; - if (entry == &n->slabs_partial) { - n->free_touched = 1; - entry = n->slabs_free.next; - if (entry == &n->slabs_free) - goto must_grow; - } + page = get_first_slab(n); + if (!page) + goto must_grow; - page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); STATS_INC_NODEALLOCS(cachep); @@ -3338,17 +3341,12 @@ free_done: #if STATS { int i = 0; - struct list_head *p; - - p = n->slabs_free.next; - while (p != &(n->slabs_free)) { - struct page *page; + struct page *page; - page = list_entry(p, struct page, lru); + list_for_each_entry(page, &n->slabs_free, lru) { BUG_ON(page->active); i++; - p = p->next; } STATS_SET_FREEABLE(cachep, i); } @@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_NOTRACK | SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -172,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. @@ -250,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page, extern void slab_init_memcg_params(struct kmem_cache *); -#else /* !CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG && !CONFIG_SLOB */ #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -291,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 3c6a86b..b50aef0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ + SLAB_NOTRACK | SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. @@ -127,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; @@ -220,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* * Find a mergeable slab cache @@ -476,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) } } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. @@ -502,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* - * The memory cgroup could have been deactivated while the cache + * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) goto out_unlock; idx = memcg_cache_id(memcg); @@ -688,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s, { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ void slab_kmem_cache_release(struct kmem_cache *s) { @@ -1122,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); @@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x) */ static __always_inline void slab_lock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } static __always_inline void slab_unlock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } @@ -5205,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { struct kmem_cache *c; @@ -5240,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, static void memcg_propagate_slab_attrs(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG int i; char *buffer = NULL; struct kmem_cache *root_cache; @@ -5326,7 +5328,7 @@ static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!is_root_cache(s)) return s->memcg_params.root_cache->memcg_kset; #endif @@ -5362,6 +5364,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; + if (s->flags & SLAB_ACCOUNT) + *p++ = 'A'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); @@ -5401,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out_del_kobj; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { @@ -5434,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2..b60802b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/bootmem.h> +#include <linux/memremap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) } /* need to make sure size is all the same during early stage */ -void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +static void * __meminit alloc_block_buf(unsigned long size, int node) { void *ptr; @@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) return ptr; } +static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) +{ + return altmap->base_pfn + altmap->reserve + altmap->alloc + + altmap->align; +} + +static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap) +{ + unsigned long allocated = altmap->alloc + altmap->align; + + if (altmap->free > allocated) + return altmap->free - allocated; + return 0; +} + +/** + * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation + * @altmap - reserved page pool for the allocation + * @nr_pfns - size (in pages) of the allocation + * + * Allocations are aligned to the size of the request + */ +static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + unsigned long pfn = vmem_altmap_next_pfn(altmap); + unsigned long nr_align; + + nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); + nr_align = ALIGN(pfn, nr_align) - pfn; + + if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) + return ULONG_MAX; + altmap->alloc += nr_pfns; + altmap->align += nr_align; + return pfn + nr_align; +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap) +{ + unsigned long pfn, nr_pfns; + void *ptr; + + if (size & ~PAGE_MASK) { + pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", + __func__, size); + return NULL; + } + + nr_pfns = size >> PAGE_SHIFT; + pfn = vmem_altmap_alloc(altmap, nr_pfns); + if (pfn < ULONG_MAX) + ptr = __va(__pfn_to_phys(pfn)); + else + ptr = NULL; + pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", + __func__, pfn, altmap->alloc, altmap->align, nr_pfns); + + return ptr; +} + +/* need to make sure size is all the same during early stage */ +void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap) +{ + if (altmap) + return altmap_alloc_block_buf(size, altmap); + return alloc_block_buf(size, node); +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); + void *p = alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b6..3717cee 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; - for (i = 0; i < PAGES_PER_SECTION; i++) { + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); @@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; @@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) } pgdat_resize_unlock(pgdat, &flags); - clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); + clear_hwpoisoned_pages(memmap + map_offset, + PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap); } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> +#include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> @@ -45,6 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -/** - * Two special cases here: we could avoid taking compound_lock_irqsave - * and could skip the tail refcounting(in _mapcount). - * - * 1. Hugetlbfs page: - * - * PageHeadHuge will remain true until the compound page - * is released and enters the buddy allocator, and it could - * not be split by __split_huge_page_refcount(). - * - * So if we see PageHeadHuge set, and we have the tail page pin, - * then we could safely put head page. - * - * 2. Slab THP page: - * - * PG_slab is cleared before the slab frees the head page, and - * tail pin cannot be the last reference left on the head page, - * because the slab code is free to reuse the compound page - * after a kfree/kmem_cache_free without having to check if - * there's any tail pin left. In turn all tail pinsmust be always - * released while the head is still pinned by the slab code - * and so we know PG_slab will be still set too. - * - * So if we see PageSlab set, and we have the tail page pin, - * then we could safely put head page. - */ -static __always_inline -void put_unrefcounted_compound_page(struct page *page_head, struct page *page) -{ - /* - * If @page is a THP tail, we must read the tail page - * flags after the head page flags. The - * __split_huge_page_refcount side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here, see the comment above this function. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab THP page, - * the tail pin must not be the last reference - * held on the page, because the PG_slab cannot - * be cleared before all tail pins (which skips - * the _mapcount tail refcounting) have been - * released. - * - * If this is the tail of a hugetlbfs page, - * the tail pin may be the last reference on - * the page instead, because PageHeadHuge will - * not go away until the compound page enters - * the buddy allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - } else - /* - * __split_huge_page_refcount run before us, - * @page was a THP tail. The split @page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - if (put_page_testzero(page)) - __put_single_page(page); -} - -static __always_inline -void put_refcounted_compound_page(struct page *page_head, struct page *page) -{ - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * @page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The @page_head may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != compound_head(page), page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* @page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } -} - -static void put_compound_page(struct page *page) -{ - struct page *page_head; - - /* - * We see the PageCompound set and PageTail not set, so @page maybe: - * 1. hugetlbfs head page, or - * 2. THP head page. - */ - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* - * We see the PageCompound set and PageTail set, so @page maybe: - * 1. a tail hugetlbfs page, or - * 2. a tail THP page, or - * 3. a split THP page. - * - * Case 3 is possible, as we may race with - * __split_huge_page_refcount tearing down a THP page. - */ - page_head = compound_head(page); - if (!__compound_tail_refcounted(page_head)) - put_unrefcounted_compound_page(page_head, page); - else - put_refcounted_compound_page(page_head, page); -} - -void put_page(struct page *page) +void __put_page(struct page *page) { if (unlikely(PageCompound(page))) - put_compound_page(page); - else if (put_page_testzero(page)) + __put_compound_page(page); + else __put_single_page(page); } -EXPORT_SYMBOL(put_page); - -/* - * This function is exported but must not be called by anything other - * than get_page(). It implements the slow path of get_page(). - */ -bool __get_page_tail(struct page *page) -{ - /* - * This takes care of get_page() if run on a tail page - * returned by one of the get_user_pages/follow_page variants. - * get_user_pages/follow_page itself doesn't need the compound - * lock because it runs __get_page_tail_foll() under the - * proper PT lock that already serializes against - * split_huge_page(). - */ - unsigned long flags; - bool got; - struct page *page_head = compound_head(page); - - /* Ref to put_compound_page() comment. */ - if (!__compound_tail_refcounted(page_head)) { - smp_rmb(); - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - __get_page_tail_foll(page, true); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - return false; - } - } - - got = false; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); - } - return got; -} -EXPORT_SYMBOL(__get_page_tail); +EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages @@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { + page = compound_head(page); if (!PageActive(page) && !PageUnevictable(page) && PageReferenced(page)) { @@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } + +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(lruvec, file, 0); + } +} + /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + activate_page_drain(cpu); } @@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page) } } +/** + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + void lru_add_drain(void) { lru_add_drain_cpu(get_cpu()); @@ -883,6 +682,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); @@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold) for (i = 0; i < nr; i++) { struct page *page = pages[i]; - if (unlikely(PageCompound(page))) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - put_compound_page(page); - continue; - } - /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + page = compound_head(page); if (!put_page_testzero(page)) continue; + if (PageCompound(page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __put_compound_page(page); + continue; + } + if (PageLRU(page)) { struct zone *pagezone = page_zone(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index d504adb..69cb246 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list) if (!entry.val) return 0; + if (mem_cgroup_try_charge_swap(page, entry)) { + swapcache_free(entry); + return 0; + } + if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry); @@ -185,13 +190,12 @@ int add_to_swap(struct page *page, struct list_head *list) * deadlock in the swap out path. */ /* - * Add it to the swap cache and mark it dirty + * Add it to the swap cache. */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - if (!err) { /* Success */ - SetPageDirty(page); + if (!err) { return 1; } else { /* -ENOMEM radix-tree allocation failure */ /* @@ -353,7 +357,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -367,7 +371,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/mm/swapfile.c b/mm/swapfile.c index 2d259fd..3888438 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -171,8 +171,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, int found_extent = 0; while (nr_pages) { - struct list_head *lh; - if (se->start_page <= start_page && start_page < se->start_page + se->nr_pages) { pgoff_t offset = start_page - se->start_page; @@ -194,8 +192,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, break; } - lh = se->list.next; - se = list_entry(lh, struct swap_extent, list); + se = list_next_entry(se, list); } } @@ -794,14 +791,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, count--; } - if (!count) - mem_cgroup_uncharge_swap(entry); - usage = count | has_cache; p->swap_map[offset] = usage; /* free if no reference */ if (!usage) { + mem_cgroup_uncharge_swap(entry); dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; @@ -909,7 +904,7 @@ int swp_swapcount(swp_entry_t entry) VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); map = kmap_atomic(page); tmp_count = map[offset]; kunmap_atomic(map); @@ -935,6 +930,9 @@ int reuse_swap_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return 0; + /* The page is part of THP and cannot be reused */ + if (PageTransCompound(page)) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -1014,7 +1012,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || mem_cgroup_swap_full(page))) { delete_from_swap_cache(page); SetPageDirty(page); } @@ -1117,19 +1115,9 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ -static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { -#ifdef CONFIG_MEM_SOFT_DIRTY - /* - * When pte keeps soft dirty bit the pte generated - * from swap entry does not has it, still it's same - * pte from logical point of view. - */ - pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); - return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); -#else - return pte_same(pte, swp_pte); -#endif + return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); } /* @@ -1151,14 +1139,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg); + if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1169,11 +1158,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge(page, memcg, true); + page_add_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1215,7 +1204,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(maybe_same_pte(*pte, swp_pte))) { + if (unlikely(pte_same_as_swp(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) @@ -1639,14 +1628,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) se = start_se; for ( ; ; ) { - struct list_head *lh; - if (se->start_page <= offset && offset < (se->start_page + se->nr_pages)) { return se->start_block + (offset - se->start_page); } - lh = se->list.next; - se = list_entry(lh, struct swap_extent, list); + se = list_next_entry(se, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ } @@ -1670,7 +1656,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->first_swap_extent.list.next, + se = list_first_entry(&sis->first_swap_extent.list, struct swap_extent, list); list_del(&se->list); kfree(se); @@ -1976,9 +1962,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } filp_close(swap_file, NULL); @@ -2203,7 +2189,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (IS_SWAPFILE(inode)) return -EBUSY; } else @@ -2436,7 +2422,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2581,7 +2567,7 @@ bad_swap: vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode = NULL; } filp_close(swap_file, NULL); @@ -2594,7 +2580,7 @@ out: if (name) putname(name); if (inode && S_ISREG(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -2965,11 +2951,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si) struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { - struct list_head *this, *next; - list_for_each_safe(this, next, &head->lru) { - struct page *page; - page = list_entry(this, struct page, lru); - list_del(this); + struct page *page, *next; + + list_for_each_entry_safe(page, next, &head->lru, lru) { + list_del(&page->lru); __free_page(page); } } diff --git a/mm/truncate.c b/mm/truncate.c index 76e35ad..e3ee0e2 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> @@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping, return; spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrshadows--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + + if (dax_mapping(mapping)) { + if (radix_tree_delete_item(&mapping->page_tree, index, entry)) + mapping->nrexceptional--; + } else { + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); + } unlock: spin_unlock_irq(&mapping->tree_lock); } @@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping, int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ @@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrshadows; + unsigned long nrexceptional; unsigned long nrpages; /* @@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping) /* * When reclaim installs eviction entries, it increases - * nrshadows first, then decreases nrpages. Make sure we see + * nrexceptional first, then decreases nrpages. Make sure we see * this in the right order or we might miss an entry. */ nrpages = mapping->nrpages; smp_rmb(); - nrshadows = mapping->nrshadows; + nrexceptional = mapping->nrexceptional; - if (nrpages || nrshadows) { + if (nrpages || nrexceptional) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 77fee93..806b0c7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); @@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -91,7 +91,7 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); out_release: page_cache_release(page); goto out; @@ -176,6 +176,37 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); +/** + * memdup_user_nul - duplicate memory region from user space and NUL-terminate + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. + */ +void *memdup_user_nul(const void __user *src, size_t len) +{ + char *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len + 1, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + p[len] = '\0'; + + return p; +} +EXPORT_SYMBOL(memdup_user_nul); + void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { @@ -199,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). - */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { @@ -355,7 +361,9 @@ struct anon_vma *page_anon_vma(struct page *page) struct address_space *page_mapping(struct page *page) { - unsigned long mapping; + struct address_space *mapping; + + page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -368,11 +376,25 @@ struct address_space *page_mapping(struct page *page) return swap_address_space(entry); } - mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + mapping = page->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return page->mapping; + return mapping; +} + +/* Slow path of page_mapcount() for compound pages */ +int __page_mapcount(struct page *page) +{ + int ret; + + ret = atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + return ret; } +EXPORT_SYMBOL_GPL(__page_mapcount); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -429,17 +451,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ - len = mm->arg_end - mm->arg_start; + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + len = arg_end - arg_start; if (len > buflen) len = buflen; - res = access_process_vm(task, mm->arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, 0); /* * If the nul at the end of args has been overwritten, then @@ -450,10 +480,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len < res) { res = len; } else { - len = mm->env_end - mm->env_start; + len = env_end - env_start; if (len > buflen - res) len = buflen - res; - res += access_process_vm(task, mm->env_start, + res += access_process_vm(task, env_start, buffer+res, len, 0); res = strnlen(buffer, res); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8e3c9c5..fb42a5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -441,8 +441,7 @@ nocache: if (list_is_last(&first->list, &vmap_area_list)) goto found; - first = list_entry(first->list.next, - struct vmap_area, list); + first = list_next_entry(first, list); } found: @@ -456,7 +455,7 @@ found: free_vmap_cache = &va->rb_node; spin_unlock(&vmap_area_lock); - BUG_ON(va->va_start & (align-1)); + BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); @@ -1087,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(addr & (PAGE_SIZE-1)); + BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); @@ -1477,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i]; BUG_ON(!page); - __free_page(page); + __free_kmem_pages(page, 0); } - if (area->flags & VM_VPAGES) - vfree(area->pages); - else - kfree(area->pages); + kvfree(area->pages); } kfree(area); @@ -1593,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, area->caller); - area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -1608,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_kmem_pages(alloc_mask, order); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_kmem_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -2559,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct vmap_area *va; spin_lock(&vmap_area_lock); - va = list_entry((&vmap_area_list)->next, typeof(*va), list); + va = list_first_entry(&vmap_area_list, typeof(*va), list); while (n > 0 && &va->list != &vmap_area_list) { n--; - va = list_entry(va->list.next, typeof(*va), list); + va = list_next_entry(va, list); } if (!n && &va->list != &vmap_area_list) return va; @@ -2576,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) struct vmap_area *va = p, *next; ++*pos; - next = list_entry(va->list.next, typeof(*va), list); + next = list_next_entry(va, list); if (&next->list != &vmap_area_list) return next; @@ -2651,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_USERMAP) seq_puts(m, " user"); - if (v->flags & VM_VPAGES) + if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index c5afd57..149fdf6 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -137,14 +137,11 @@ struct vmpressure_event { }; static bool vmpressure_event(struct vmpressure *vmpr, - unsigned long scanned, unsigned long reclaimed) + enum vmpressure_levels level) { struct vmpressure_event *ev; - enum vmpressure_levels level; bool signalled = false; - level = vmpressure_calc_level(scanned, reclaimed); - mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { @@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work) struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; + enum vmpressure_levels level; spin_lock(&vmpr->sr_lock); /* @@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - scanned = vmpr->scanned; + scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } - reclaimed = vmpr->reclaimed; - vmpr->scanned = 0; - vmpr->reclaimed = 0; + reclaimed = vmpr->tree_reclaimed; + vmpr->tree_scanned = 0; + vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); + level = vmpressure_calc_level(scanned, reclaimed); + do { - if (vmpressure_event(vmpr, scanned, reclaimed)) + if (vmpressure_event(vmpr, level)) break; /* * If not handled, propagate the event upward into the @@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work) * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle + * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * @@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work) * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * + * If @tree is set, vmpressure is in traditional userspace reporting + * mode: @memcg is considered the pressure root and userspace is + * notified of the entire subtree's reclaim efficiency. + * + * If @tree is not set, reclaim efficiency is recorded for @memcg, and + * only in-kernel users are notified. + * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -238,15 +246,46 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - spin_lock(&vmpr->sr_lock); - vmpr->scanned += scanned; - vmpr->reclaimed += reclaimed; - scanned = vmpr->scanned; - spin_unlock(&vmpr->sr_lock); + if (tree) { + spin_lock(&vmpr->sr_lock); + scanned = vmpr->tree_scanned += scanned; + vmpr->tree_reclaimed += reclaimed; + spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) - return; - schedule_work(&vmpr->work); + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); + } else { + enum vmpressure_levels level; + + /* For now, no users for root-level efficiency */ + if (!memcg || memcg == root_mem_cgroup) + return; + + spin_lock(&vmpr->sr_lock); + scanned = vmpr->scanned += scanned; + reclaimed = vmpr->reclaimed += reclaimed; + if (scanned < vmpressure_win) { + spin_unlock(&vmpr->sr_lock); + return; + } + vmpr->scanned = vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + level = vmpressure_calc_level(scanned, reclaimed); + + if (level > VMPRESSURE_LOW) { + /* + * Let the socket buffer allocator know that + * we are having trouble reclaiming LRU pages. + * + * For hysteresis keep the pressure state + * asserted for a second in which subsequent + * pressure events can occur. + */ + memcg->socket_pressure = jiffies + HZ; + } + } } /** @@ -276,7 +315,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, vmpressure_win, 0); + vmpressure(gfp, memcg, true, vmpressure_win, 0); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 2aec424..71b1c29 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,6 +46,7 @@ #include <linux/oom.h> #include <linux/prefetch.h> #include <linux/printk.h> +#include <linux/dax.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -106,8 +107,6 @@ struct scan_control { unsigned long nr_reclaimed; }; -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - #ifdef ARCH_HAS_PREFETCH #define prefetch_prev_lru_page(_page, _base, _field) \ do { \ @@ -197,11 +196,13 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); + zone_page_state(zone, NR_INACTIVE_ANON) + + zone_page_state(zone, NR_ISOLATED_ANON); return nr; } @@ -411,7 +412,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_is_active(memcg)) + if (memcg && !memcg_kmem_online(memcg)) return 0; if (nr_scanned == 0) @@ -594,7 +595,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); + trace_mm_vmscan_writepage(page); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. + * + * We also don't store shadows for DAX mappings because the + * only page cache pages found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same page_tree. */ if (reclaimed && page_is_file_cache(page) && - !mapping_exiting(mapping)) + !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -906,6 +913,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool lazyfree = false; + int ret = SWAP_SUCCESS; cond_resched(); @@ -1049,6 +1058,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; + lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1060,14 +1070,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, - ttu_flags|TTU_BATCH_FLUSH)) { + switch (ret = try_to_unmap(page, lazyfree ? + (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : + (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; + case SWAP_LZFREE: + goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1174,6 +1187,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } +lazyfree: if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; @@ -1184,8 +1198,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, * we obviously don't have to worry about waking up a process * waiting on the page lock, because there are no references. */ - __clear_page_locked(page); + __ClearPageLocked(page); free_it: + if (ret == SWAP_LZFREE) + count_vm_event(PGLAZYFREED); + nr_reclaimed++; /* @@ -1204,7 +1221,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && mem_cgroup_swap_full(page)) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); @@ -1426,6 +1443,7 @@ int isolate_lru_page(struct page *page) int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); + WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { struct zone *zone = page_zone(page); @@ -1691,11 +1709,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, - zone_idx(zone), - nr_scanned, nr_reclaimed, - sc->priority, - trace_shrink_flags(file)); + trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + sc->priority, file); return nr_reclaimed; } @@ -1958,10 +1973,11 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, int swappiness, +static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr, unsigned long *lru_pages) { + int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ @@ -1988,14 +2004,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, if (current_is_kswapd()) { if (!zone_reclaimable(zone)) force_scan = true; - if (!mem_cgroup_lruvec_online(lruvec)) + if (!mem_cgroup_online(memcg)) force_scan = true; } if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { scan_balance = SCAN_FILE; goto out; } @@ -2046,10 +2062,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, } /* - * There is enough inactive page cache, do not reclaim - * anything from the anonymous working set right now. + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. */ - if (!inactive_file_is_low(lruvec)) { + if (!inactive_file_is_low(lruvec) && + get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2179,9 +2201,10 @@ static inline void init_tlb_ubc(void) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *lru_pages) +static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long *lru_pages) { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2191,7 +2214,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr, lru_pages); + get_scan_count(lruvec, memcg, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2393,9 +2416,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long reclaimed; unsigned long scanned; - struct lruvec *lruvec; - int swappiness; if (mem_cgroup_low(root, memcg)) { if (!sc->may_thrash) @@ -2403,11 +2425,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_events(memcg, MEMCG_LOW, 1); } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); - swappiness = mem_cgroup_swappiness(memcg); + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + shrink_zone_memcg(zone, memcg, sc, &lru_pages); zone_lru_pages += lru_pages; if (memcg && is_classzone) @@ -2415,6 +2436,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg, sc->nr_scanned - scanned, lru_pages); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2446,7 +2472,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaim_state->reclaimed_slab = 0; } - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2871,8 +2898,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !noswap, }; - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - int swappiness = mem_cgroup_swappiness(memcg); unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2889,7 +2914,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); + shrink_zone_memcg(zone, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); diff --git a/mm/vmstat.c b/mm/vmstat.c index 0d5712b..084c672 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -219,7 +219,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -318,8 +318,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); * 1 Overstepping half of threshold * -1 Overstepping minus half of threshold */ -static inline void mod_state(struct zone *zone, - enum zone_stat_item item, int delta, int overstep_mode) +static inline void mod_state(struct zone *zone, enum zone_stat_item item, + long delta, int overstep_mode) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -357,7 +357,7 @@ static inline void mod_state(struct zone *zone, } void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { mod_state(zone, item, delta, 0); } @@ -384,7 +384,7 @@ EXPORT_SYMBOL(dec_zone_page_state); * Use interrupt disable to serialize counter updates */ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { unsigned long flags; @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -781,6 +783,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd") @@ -842,7 +845,9 @@ const char * const vmstat_text[] = { "thp_fault_fallback", "thp_collapse_alloc", "thp_collapse_alloc_failed", - "thp_split", + "thp_split_page", + "thp_split_page_failed", + "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif @@ -1386,15 +1391,20 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. + * If we were marked on cpu_stat_off clear the flag + * so that vmstat_shepherd doesn't schedule us again. */ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, - this_cpu_ptr(&vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + if (!cpumask_test_and_clear_cpu(smp_processor_id(), + cpu_stat_off)) { + queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } } else { /* * We did not update any counters so the app may be in @@ -1403,21 +1413,16 @@ static void vmstat_update(struct work_struct *w) * Defer the checking for differentials to the * shepherd thread on a different processor. */ - int r; - /* - * Shepherd work thread does not race since it never - * changes the bit if its zero but the cpu - * online / off line code may race if - * worker threads are still allowed during - * shutdown / startup. - */ - r = cpumask_test_and_set_cpu(smp_processor_id(), - cpu_stat_off); - VM_BUG_ON(r); + cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } /* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +/* * Check if the diffs for a certain cpu indicate that * an update is needed. */ @@ -1440,6 +1445,30 @@ static bool need_update(int cpu) return false; } +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + /* + * If we are already in hands of the shepherd then there + * is nothing for us to do here. + */ + if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + return; + + if (!need_update(smp_processor_id())) + return; + + /* + * Just refresh counters and do not care about the pending delayed + * vmstat_update. It doesn't fire that often to matter and canceling + * it would be too expensive from this path. + * vmstat_shepherd will take care about that for us. + */ + refresh_cpu_vm_stats(false); +} + /* * Shepherd worker thread that checks the @@ -1449,7 +1478,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { @@ -1457,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && - cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - - queue_delayed_work_on(cpu, vmstat_wq, - &per_cpu(vmstat_work, cpu), 0); - + for_each_cpu(cpu, cpu_stat_off) { + struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + + if (need_update(cpu)) { + if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + } else { + /* + * Cancel the work if quiet_vmstat has put this + * cpu on cpu_stat_off because the work item might + * be still scheduled + */ + cancel_delayed_work(dw); + } + } put_online_cpus(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); - } static void __init start_shepherd_timer(void) @@ -1476,13 +1512,14 @@ static void __init start_shepherd_timer(void) int cpu; for_each_possible_cpu(cpu) - INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) BUG(); cpumask_copy(cpu_stat_off, cpu_online_mask); + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -1550,7 +1587,6 @@ static int __init setup_vmstat(void) start_shepherd_timer(); cpu_notifier_register_done(); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); diff --git a/mm/workingset.c b/mm/workingset.c index aa01713..61ead9e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, node->slots[i] = NULL; BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); node->count -= 1U << RADIX_TREE_COUNT_SHIFT; - BUG_ON(!mapping->nrshadows); - mapping->nrshadows--; + BUG_ON(!mapping->nrexceptional); + mapping->nrexceptional--; } } BUG_ON(node->count); @@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) spin_unlock(&pool->lock); } -#define list_tail_entry(ptr, type, member) \ - list_entry((ptr)->prev, type, member) - /** * zbud_reclaim_page() - evicts allocations from a pool page and frees it * @pool: pool from which a page will attempt to be evicted @@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) return -EINVAL; } for (i = 0; i < retries; i++) { - zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); list_del(&zhdr->lru); list_del(&zhdr->buddy); /* Protect zbud page against free */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9f15bdd..2d7c4c1 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -213,10 +213,10 @@ struct size_class { int size; unsigned int index; - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int pages_per_zspage; struct zs_size_stat stats; + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ bool huge; }; @@ -309,7 +309,12 @@ static void free_handle(struct zs_pool *pool, unsigned long handle) static void record_obj(unsigned long handle, unsigned long obj) { - *(unsigned long *)handle = obj; + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. + */ + WRITE_ONCE(*(unsigned long *)handle, obj); } /* zpool driver */ @@ -1635,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj = obj_malloc(d_page, class, handle); zs_object_copy(free_obj, used_obj, class); index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. + */ + free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); |