diff options
Diffstat (limited to 'mm')
42 files changed, 970 insertions, 901 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd1..5b0adf1 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify @@ -53,6 +53,11 @@ unsigned long cma_get_size(const struct cma *cma) return cma->count << PAGE_SHIFT; } +const char *cma_get_name(const struct cma *cma) +{ + return cma->name ? cma->name : "(undefined)"; +} + static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, int align_order) { @@ -168,6 +173,7 @@ core_initcall(cma_init_reserved_areas); */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, + const char *name, struct cma **res_cma) { struct cma *cma; @@ -198,6 +204,13 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * subsystems (like slab allocator) are available. 
*/ cma = &cma_areas[cma_area_count]; + if (name) { + cma->name = name; + } else { + cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count); + if (!cma->name) + return -ENOMEM; + } cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; @@ -229,7 +242,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, struct cma **res_cma) + bool fixed, const char *name, struct cma **res_cma) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start; @@ -335,7 +348,7 @@ int __init cma_declare_contiguous(phys_addr_t base, base = addr; } - ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); + ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) goto err; @@ -491,3 +504,17 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) return true; } + +int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = it(&cma_areas[i], data); + + if (ret) + return ret; + } + + return 0; +} @@ -11,6 +11,7 @@ struct cma { struct hlist_head mem_head; spinlock_t mem_head_lock; #endif + const char *name; }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ffc0c3d..595b757 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%d", idx); + sprintf(name, "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa..09c5282 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -992,9 +992,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, static bool 
suitable_migration_target(struct compact_control *cc, struct page *page) { - if (cc->ignore_block_suitable) - return true; - /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1006,6 +1003,9 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } + if (cc->ignore_block_suitable) + return true; + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ if (migrate_async_suitable(get_pageblock_migratetype(page))) return true; diff --git a/mm/filemap.c b/mm/filemap.c index dc59c5f..681da61 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2204,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; + pgoff_t max_off; struct page *page; - loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_SIZE); - if (offset >= size >> PAGE_SHIFT) + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; /* @@ -2258,8 +2258,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. 
*/ - size = round_up(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= size >> PAGE_SHIFT)) { + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) { unlock_page(page); put_page(page); return VM_FAULT_SIGBUS; @@ -2325,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - loff_t size; + unsigned long max_idx; struct page *head, *page; rcu_read_lock(); @@ -2371,8 +2371,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= size >> PAGE_SHIFT) + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= max_idx) goto unlock; if (file->f_ra.mmap_miss > 0) @@ -2720,18 +2720,16 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ - if (mapping->nrpages) { - written = invalidate_inode_pages2_range(mapping, + written = invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end); - /* - * If a page can not be invalidated, return 0 to fall back - * to buffered write. - */ - if (written) { - if (written == -EBUSY) - return 0; - goto out; - } + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; } written = mapping->a_ops->direct_IO(iocb, from); @@ -2744,10 +2742,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... 
*/ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); if (written > 0) { pos += written; @@ -1575,7 +1575,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - start, len))) + (void __user *)start, len))) return 0; /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3c4f9d..b787c4c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,9 +1564,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); @@ -1575,6 +1572,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); @@ -2145,15 +2144,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) @@ -2399,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); if (PageAnon(head)) { diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 9d26fd9..356df05 100644 --- 
a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -34,8 +34,7 @@ static int hwpoison_inject(void *data, u64 val) if (!hwpoison_filter_enable) goto inject; - if (!PageLRU(hpage) && !PageHuge(p)) - shake_page(hpage, 0); + shake_page(hpage, 0); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/internal.h b/mm/internal.h index 266efae..04d08ef 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; /* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* * in mm/vmscan.c: */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: @@ -505,4 +510,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b2719..9348d27 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, shadow_byte); + kasan_report_double_free(cache, object, + __builtin_return_address(1)); return true; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dd2dea8..1229298 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -99,7 +99,7 @@ static inline 
const void *kasan_shadow_to_mem(const void *shadow_addr) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow); + void *ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ab42a08..beee0e9 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? 
"Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + return bug_type; +} + +const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_bug_type(info); + + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags) kasan_enable_current(); } -static void print_track(struct kasan_track *track) +static void print_track(struct kasan_track *track, const char *prefix) { - pr_err("PID = %u\n", track->pid); + pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { struct stack_trace trace; @@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track) } } -static void kasan_object_err(struct kmem_cache *cache, void *object) +static struct page *addr_to_page(const void *addr) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} - dump_stack(); - pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, - cache->object_size); +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) +{ + 
unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - if (!(cache->flags & SLAB_KASAN)) + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); + + if (!addr) return; - pr_err("Allocated:\n"); - print_track(&alloc_info->alloc_track); - pr_err("Freed:\n"); - print_track(&alloc_info->free_track); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow) +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) { - unsigned long flags; + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - kasan_start_report(&flags); - pr_err("BUG: Double free or freeing an invalid pointer\n"); - pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - kasan_object_err(cache, object); - kasan_end_report(&flags); + if (cache->flags & SLAB_KASAN) { + print_track(&alloc_info->alloc_track, "Allocated"); + pr_err("\n"); + print_track(&alloc_info->free_track, "Freed"); + pr_err("\n"); + } + + describe_object_addr(cache, object, addr); } -static void print_address_description(struct kasan_access_info *info) +static void print_address_description(void *addr) { - const void *addr = info->access_addr; + struct page *page = addr_to_page(addr); - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) { 
- struct page *page = virt_to_head_page(addr); - - if (PageSlab(page)) { - void *object; - struct kmem_cache *cache = page->slab_cache; - object = nearest_obj(cache, page, - (void *)info->access_addr); - kasan_object_err(cache, object); - return; - } - dump_page(page, "kasan: bad access detected"); + dump_stack(); + pr_err("\n"); + + if (page && PageSlab(page)) { + struct kmem_cache *cache = page->slab_cache; + void *object = nearest_obj(cache, page, addr); + + describe_object(cache, object, addr); } - if (kernel_or_module_addr(addr)) { - if (!init_task_stack_addr(addr)) - pr_err("Address belongs to variable %pS\n", addr); + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + } + + if (page) { + pr_err("The buggy address belongs to the page:\n"); + dump_page(page, "kasan: bad access detected"); } - dump_stack(); } static bool row_is_guilty(const void *row, const void *guilty) @@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr) } } +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip) +{ + unsigned long flags; + + kasan_start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("\n"); + print_address_description(object); + pr_err("\n"); + print_shadow_for_address(object); + kasan_end_report(&flags); +} + static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; - const char *bug_type; kasan_start_report(&flags); - if (info->access_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - pr_err("BUG: KASAN: %s on address %p\n", - bug_type, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? 
"Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); + print_error_description(info); + pr_err("\n"); + + if (!addr_has_shadow(info)) { dump_stack(); } else { - print_error_description(info); - print_address_description(info); + print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f..7cb9c88 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - /* 0 stands for page_is_file_cache(page) == false */ - dec_node_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); unlock_page(page); putback_lru_page(page); } @@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* * We can do it before isolate_lru_page because the @@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; @@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - /* 0 stands for page_is_file_cache(page) == false */ - inc_node_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * The page must only be referenced by the scanned process * and page swap cache. 
*/ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; int search_new_forks = 0; VM_BUG_ON_PAGE(!PageKsm(page), page); @@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) - return ret; + return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1978,23 +1977,20 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg); - if (ret != SWAP_AGAIN) { + if (!rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -out: - return ret; } #ifdef CONFIG_MIGRATION diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0..25b78ee 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); set_pte_at(mm, addr, pte, ptent); - if (PageActive(page)) - deactivate_page(page); tlb_remove_tlb_entry(tlb, pte, addr); } + mark_page_lazyfree(page); } out: if (nr_swap) { @@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma, /* * Error injection support for memory error handling. 
*/ -static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) +static int madvise_inject_error(int behavior, + unsigned long start, unsigned long end) { - struct page *p; + struct page *page; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + for (; start < end; start += PAGE_SIZE << - compound_order(compound_head(p))) { + compound_order(compound_head(page))) { int ret; - ret = get_user_pages_fast(start, 1, 0, &p); + ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; - if (PageHWPoison(p)) { - put_page(p); + if (PageHWPoison(page)) { + put_page(page); continue; } - if (bhv == MADV_SOFT_OFFLINE) { - pr_info("Soft offlining page %#lx at %#lx\n", - page_to_pfn(p), start); - ret = soft_offline_page(p, MF_COUNT_INCREASED); + + if (behavior == MADV_SOFT_OFFLINE) { + pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", + page_to_pfn(page), start); + + ret = soft_offline_page(page, MF_COUNT_INCREASED); if (ret) return ret; continue; } - pr_info("Injecting memory failure for page %#lx at %#lx\n", - page_to_pfn(p), start); - ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", + page_to_pfn(page), start); + + ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED); if (ret) return ret; } @@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - /* - * XXX: In this implementation, MADV_FREE works like - * MADV_DONTNEED on swapless system or full swap. 
- */ - if (get_nr_swap_pages() > 0) - return madvise_free(vma, prev, start, end); - /* passthrough */ + return madvise_free(vma, prev, start, end); case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -688,6 +687,10 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: +#ifdef CONFIG_MEMORY_FAILURE + case MADV_SOFT_OFFLINE: + case MADV_HWPOISON: +#endif return true; default: @@ -761,10 +764,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) size_t len; struct blk_plug plug; -#ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) - return madvise_hwpoison(behavior, start, start+len_in); -#endif if (!madvise_behavior_valid(behavior)) return error; @@ -784,6 +783,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (end == start) return error; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_inject_error(behavior, start, start + len_in); +#endif + write = madvise_need_mmap_write(behavior); if (write) { if (down_write_killable(&current->mm->mmap_sem)) diff --git a/mm/memblock.c b/mm/memblock.c index 696f06d..b049c9b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -805,6 +805,18 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) } /** + * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. 
+ */ +int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); +} + +/** * __next_reserved_mem_region - next function for for_each_reserved_region() * @idx: pointer to u64 loop variable * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL @@ -1531,11 +1543,37 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) (phys_addr_t)ULLONG_MAX); } +void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + if (!size) + return; + + ret = memblock_isolate_range(&memblock.memory, base, size, + &start_rgn, &end_rgn); + if (ret) + return; + + /* remove all the MAP regions */ + for (i = memblock.memory.cnt - 1; i >= end_rgn; i--) + if (!memblock_is_nomap(&memblock.memory.regions[i])) + memblock_remove_region(&memblock.memory, i); + + for (i = start_rgn - 1; i >= 0; i--) + if (!memblock_is_nomap(&memblock.memory.regions[i])) + memblock_remove_region(&memblock.memory, i); + + /* truncate the reserved regions */ + memblock_remove_range(&memblock.reserved, 0, base); + memblock_remove_range(&memblock.reserved, + base + size, (phys_addr_t)ULLONG_MAX); +} + void __init memblock_mem_limit_remove_map(phys_addr_t limit) { - struct memblock_type *type = &memblock.memory; phys_addr_t max_addr; - int i, ret, start_rgn, end_rgn; if (!limit) return; @@ -1546,19 +1584,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) if (max_addr == (phys_addr_t)ULLONG_MAX) return; - ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX, - &start_rgn, &end_rgn); - if (ret) - return; - - /* remove all the MAP regions above the limit */ - for (i = end_rgn - 1; i >= start_rgn; i--) { - if (!memblock_is_nomap(&type->regions[i])) - memblock_remove_region(type, i); - } - /* truncate the reserved regions */ - memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + 
memblock_cap_memory_range(0, max_addr); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541..ff73899 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -100,24 +100,7 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; } -static const char * const mem_cgroup_stat_names[] = { - "cache", - "rss", - "rss_huge", - "mapped_file", - "dirty", - "writeback", - "swap", -}; - -static const char * const mem_cgroup_events_names[] = { - "pgpgin", - "pgpgout", - "pgfault", - "pgmajfault", -}; - -static const char * const mem_cgroup_lru_names[] = { +static const char *const mem_cgroup_lru_names[] = { "inactive_anon", "active_anon", "inactive_file", @@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * common workload, threshold and synchronization as vmstat[] should be * implemented. */ -static unsigned long -mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) -{ - long val = 0; - int cpu; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - /* - * Summing races with updates, so val may be negative. Avoid exposing - * transient negative values. - */ - if (val < 0) - val = 0; - return val; -} -static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static unsigned long memcg_sum_events(struct mem_cgroup *memcg, + enum memcg_event_item event) { unsigned long val = 0; int cpu; for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[idx], cpu); + val += per_cpu(memcg->stat->events[event], cpu); return val; } @@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. 
*/ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], - nr_pages); - else - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); + else { + __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); + if (PageSwapBacked(page)) + __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); + } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + __this_cpu_inc(memcg->stat->events[PGPGIN]); else { - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + __this_cpu_inc(memcg->stat->events[PGPGOUT]); nr_pages = -nr_pages; /* for event */ } @@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +unsigned int memcg1_stats[] = { + MEMCG_CACHE, + MEMCG_RSS, + MEMCG_RSS_HUGE, + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", + "rss_huge", + "shmem", + "mapped_file", + "dirty", + "writeback", + "swap", +}; + #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
@@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; - pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], - K(mem_cgroup_read_stat(iter, i))); + pr_cont(" %s:%luKB", memcg1_stat_names[i], + K(memcg_page_state(iter, memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) @@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); + mem_cgroup_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1928,7 +1916,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + mem_cgroup_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1971,7 +1959,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 
1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); } /** @@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += mem_cgroup_read_stat(iter, i); + stat[i] += memcg_page_state(iter, i); } } @@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < MEMCG_NR_EVENTS; i++) - events[i] += mem_cgroup_read_events(iter, i); + events[i] += memcg_sum_events(iter, i); } } @@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) struct mem_cgroup *iter; for_each_mem_cgroup_tree(iter, memcg) { - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_RSS); + val += memcg_page_state(iter, MEMCG_CACHE); + val += memcg_page_state(iter, MEMCG_RSS); if (swap) - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_SWAP); + val += memcg_page_state(iter, MEMCG_SWAP); } } else { if (!swap) @@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ +/* Universal VM events cgroup1 shows, original sort order */ +unsigned int memcg1_events[] = { + PGPGIN, + PGPGOUT, + PGFAULT, + PGMAJFAULT, +}; + +static const char *const memcg1_event_names[] = { + "pgpgin", + "pgpgout", + "pgfault", + "pgmajfault", +}; + static int memcg_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v) struct mem_cgroup *mi; unsigned int i; - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != - MEM_CGROUP_STAT_NSTATS); - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != - MEM_CGROUP_EVENTS_NSTATS); + BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != 
ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], - mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + memcg_page_state(memcg, memcg1_stats[i]) * + PAGE_SIZE); } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) - seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], - mem_cgroup_read_events(memcg, i)); + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "%s %lu\n", memcg1_event_names[i], + memcg_sum_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], @@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); + val += memcg_page_state(mi, memcg1_stats[i]) * + PAGE_SIZE; + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { unsigned long long val = 0; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_events(mi, i); - seq_printf(m, "total_%s %llu\n", - mem_cgroup_events_names[i], val); + val += memcg_sum_events(mi, memcg1_events[i]); + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); } for (i = 0; i 
< NR_LRU_LISTS; i++) { @@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; @@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); + __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); + __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); } /* * move_lock grabbed above and caller set from->moving_account, so - * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * mod_memcg_page_state will serialize updates to PageDirty. * So mapping should be stable for dirty pages. 
*/ if (!anon && PageDirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + __this_cpu_add(to->stat->count[NR_FILE_DIRTY], nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); + __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); + __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); } /* @@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_events(memcg, MEMCG_OOM, 1); + mem_cgroup_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); return 0; } @@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v) tree_events(memcg, events); seq_printf(m, "anon %llu\n", - (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); + (u64)stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + (u64)stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", 
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", @@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + seq_printf(m, "shmem %llu\n", + (u64)stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); + (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); + (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); + (u64)stat[NR_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", - events[MEM_CGROUP_EVENTS_PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", - events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + + seq_printf(m, "workingset_refault %lu\n", + stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + stat[WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + stat[WORKINGSET_NODERECLAIM]); return 0; } @@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, unsigned long nr_kmem, - struct page *dummy_page) + unsigned long nr_kmem, unsigned long nr_huge, + unsigned long nr_shmem, struct page *dummy_page) { unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; @@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, } local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], 
nr_anon); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); + __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); @@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; + unsigned long nr_shmem = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; @@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); - pgpgout = nr_anon = nr_file = - nr_huge = nr_kmem = 0; + nr_kmem, nr_huge, nr_shmem, page); + pgpgout = nr_anon = nr_file = nr_kmem = 0; + nr_huge = nr_shmem = 0; } memcg = page->mem_cgroup; } @@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list) } if (PageAnon(page)) nr_anon += nr_pages; - else + else { nr_file += nr_pages; + if (PageSwapBacked(page)) + nr_shmem += nr_pages; + } pgpgout++; } else { nr_kmem += 1 << compound_order(page); @@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); + nr_kmem, nr_huge, nr_shmem, page); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210..73066b8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -220,6 +220,9 @@ static int 
kill_proc(struct task_struct *t, unsigned long addr, int trapno, */ void shake_page(struct page *p, int access) { + if (PageHuge(p)) + return; + if (!PageSlab(p)) { lru_add_drain_all(); if (PageLRU(p)) @@ -322,7 +325,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, - int fail, struct page *page, unsigned long pfn, + bool fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -904,35 +907,36 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - int ret; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; + bool mlocked = PageMlocked(hpage); /* * Here we are interested only in user-mapped pages, so skip any * other types of pages. */ if (PageReserved(p) || PageSlab(p)) - return SWAP_SUCCESS; + return true; if (!(PageLRU(hpage) || PageHuge(p))) - return SWAP_SUCCESS; + return true; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. 
*/ if (!page_mapped(hpage)) - return SWAP_SUCCESS; + return true; if (PageKsm(p)) { pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); - return SWAP_FAIL; + return false; } if (PageSwapCache(p)) { @@ -971,12 +975,19 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); - if (ret != SWAP_SUCCESS) + unmap_success = try_to_unmap(hpage, ttu); + if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); /* + * try_to_unmap() might put mlocked page in lru cache, so call + * shake_page() again to ensure that it's flushed. + */ + if (mlocked) + shake_page(hpage, 0); + + /* * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if * killing is needed or not. Only kill when the page @@ -987,10 +998,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, trapno, - ret != SWAP_SUCCESS, p, pfn, flags); + kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); - return ret; + return unmap_success; } static void set_page_hwpoison_huge_page(struct page *hpage) @@ -1138,22 +1148,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageHuge(p)) { - if (!PageLRU(p)) - shake_page(p, 0); - if (!PageLRU(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - if (flags & MF_COUNT_INCREASED) - action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); - else - action_result(pfn, MF_MSG_BUDDY_2ND, - MF_DELAYED); - return 0; - } - } + shake_page(p, 0); + /* shake_page could have turned it free. 
*/ + if (!PageLRU(p) && is_free_buddy_page(p)) { + if (flags & MF_COUNT_INCREASED) + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); + else + action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); + return 0; } lock_page(hpage); @@ -1230,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * When the raw error page is thp tail page, hpage points to the raw * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) - != SWAP_SUCCESS) { + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; @@ -1543,8 +1544,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ put_hwpoison_page(page); - pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", - pfn, page->flags); + pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", + pfn, page->flags, &page->flags); return -EIO; } } @@ -1585,8 +1586,8 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - pr_info("soft offline: %#lx: migration failed %d, type %lx\n", - pfn, ret, page->flags); + pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pfn, ret, page->flags, &page->flags); /* * We know that soft_offline_huge_page() tries to migrate * only one hugepage pointed to by hpage, so we need not @@ -1677,14 +1678,14 @@ static int __soft_offline_page(struct page *page, int flags) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: migration failed %d, type %lx\n", - pfn, ret, page->flags); + pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pfn, ret, page->flags, &page->flags); if (ret > 0) ret = -EIO; } } else { - pr_info("soft offline: %#lx: isolation failed: %d, 
page count %d, type %lx\n", - pfn, ret, page_count(page), page->flags); + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n", + pfn, ret, page_count(page), page->flags, &page->flags); } return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6fa7208..b63d7d1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) arch_refresh_nodedata(nid, pgdat); } else { - /* Reset the nr_zones, order and classzone_idx before reuse */ + /* + * Reset the nr_zones, order and classzone_idx before reuse. + * Note that kswapd will init kswapd_classzone_idx properly + * when it starts in the near future. + */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = 0; diff --git a/mm/migrate.c b/mm/migrate.c index 738f1d5..89a0a17 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, +static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { struct page_vma_mapped_walk pvmw = { @@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return SWAP_AGAIN; + return true; } /* @@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, { int z; - if (!pgdat_reclaimable(pgdat)) - return false; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; @@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (PageSwapBacked(page)) + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy 
page->mapping to the new page: */ new_page->mapping = page->mapping; @@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) */ static void __munlock_isolated_page(struct page *page) { - int ret = SWAP_AGAIN; - /* * Optimization: if the page was mapped just once, that's our mapping * and we don't need to check all the other vmas. */ if (page_mapcount(page) > 1) - ret = try_to_munlock(page); + try_to_munlock(page); /* Did try_to_unlock() succeed or punt? */ - if (ret != SWAP_MLOCK) + if (!PageMlocked(page)) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); @@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, struct user_struct *user = NULL; struct hstate *hs; - hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d083714..04c9143 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -685,6 +685,7 @@ void exit_oom_victim(void) void oom_killer_enable(void) { oom_killer_disabled = false; + pr_info("OOM killer enabled.\n"); } /** @@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout) oom_killer_enable(); return false; } + pr_info("OOM killer disabled.\n"); return true; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8ac2a7..2359608 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - init_timer_deferrable(&dom->period_timer); - dom->period_timer.function = writeout_period; - dom->period_timer.data = (unsigned long)dom; + setup_deferrable_timer(&dom->period_timer, writeout_period, + (unsigned long)dom); dom->dirty_limit_tstamp = jiffies; @@ -2428,7 +2427,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - 
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); + inc_memcg_page_state(page, NR_FILE_DIRTY); __inc_node_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); @@ -2450,7 +2449,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); + dec_memcg_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2707,7 +2706,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); + dec_memcg_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2754,7 +2753,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + dec_memcg_page_state(page, NR_WRITEBACK); dec_node_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); @@ -2809,7 +2808,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + inc_memcg_page_state(page, NR_WRITEBACK); inc_node_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd01501..2c25de4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,7 @@ #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/memcontrol.h> +#include <linux/ftrace.h> #include <asm/sections.h> #include 
<asm/tlbflush.h> @@ -1090,14 +1091,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; bool isolated_pageblocks; spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1150,12 +1147,7 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); @@ -1698,10 +1690,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1755,17 +1747,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); @@ -2045,8 +2030,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! 
*/ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); @@ -2103,8 +2088,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2161,8 +2145,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) + if (can_steal && !is_migrate_highatomic_page(page)) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2502,7 +2485,7 @@ void free_hot_cold_page(struct page *page, bool cold) /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. 
Otherwise, we may have to free * excessively into the page allocator */ @@ -2612,7 +2595,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3110,8 +3093,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; pr_warn("%s: ", current->comm); @@ -3522,19 +3504,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) } /* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - -/* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. 
*/ @@ -3579,13 +3554,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); @@ -3771,7 +3744,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3971,10 +3944,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -4510,7 +4485,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4533,8 +4507,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? 
"yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { @@ -7429,7 +7403,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcff..88ccc044 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif @@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
- * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c..1b0f48c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a5..7927bbb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!is_migrate_isolate_page(page)) goto out; /* @@ -205,7 +205,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!page || !is_migrate_isolate_page(page)) continue; unset_migratetype_isolate(page, migratetype); } @@ -262,7 +262,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (page && !is_migrate_isolate_page(page)) break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c6..be19e98 100644 --- a/mm/page_poison.c +++ 
b/mm/page_poison.c @@ -6,7 +6,6 @@ #include <linux/poison.h> #include <linux/ratelimit.h> -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -19,74 +18,21 @@ early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) { - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) -{ /* - * page poisoning is debug page alloc for some arches. If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. */ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { 
void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was poisoned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -812,7 +812,6 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int ret; int we_locked = 0; struct page_referenced_arg pra = { .mapcount = total_mapcount(page), @@ -846,7 +845,7 @@ int page_referenced(struct page *page, rwc.invalid_vma = invalid_page_referenced_vma; } - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); *vm_flags = pra.vm_flags; if (we_locked) @@ -855,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct
page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + bool ret = true; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; 
if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + WARN_ON_ONCE(1); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } - if (!PageDirty(page) && (flags & TTU_LZFREE)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. + */ + set_pte_at(mm, address, pvmw.pte, pteval); + SetPageSwapBacked(page); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. 
- * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a mapping, try again later - * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. + * If unmap is successful, return true. Otherwise, false. */ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { - int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; - struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - ret = rmap_walk_locked(page, &rwc); + rmap_walk_locked(page, &rwc); else - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { - ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } - return ret; + return !page_mapcount(page) ? true : false; } static int page_not_mapped(struct page *page) @@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked. 
*/ -int try_to_munlock(struct page *page) -{ - int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. 
That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else 
- return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd2167..6bb4deb 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. */ +#define pr_fmt(fmt) "rodata_test: " fmt + #include <linux/uaccess.h> #include <asm/sections.h> const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } @@ -3879,7 +3879,12 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, prev = cachep->cpu_cache; cachep->cpu_cache = cpu_cache; - kick_all_cpus_sync(); + /* + * Without a previous 
cpu_cache there's no need to synchronize remote + * cpus, so skip the IPIs. + */ + if (prev) + kick_all_cpus_sync(); check_irq_on(); cachep->batchcount = batchcount; diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c..6903c8f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long usemap_size(void) { - unsigned long size_bytes; - size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; - size_bytes = roundup(size_bytes, sizeof(unsigned long)); - return size_bytes; + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -571,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec, + LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + /* + * lazyfree pages are clean anonymous pages. 
They have + * SwapBacked flag cleared to distinguish normal anonymous + * pages + */ + ClearPageSwapBacked(page); + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, 1, 0); } } @@ -614,9 +621,9 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); activate_page_drain(cpu); } @@ -648,22 +655,22 @@ void deactivate_file_page(struct page *page) } /** - * deactivate_page - deactivate a page + * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page. 
*/ -void deactivate_page(struct page *page) +void mark_page_lazyfree(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + put_cpu_var(lru_lazyfree_pvecs); } } @@ -703,7 +710,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index b1ccb58..aa1c415 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -241,8 +241,10 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) + if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " + "without swap slots cache.\n", __func__)) goto out_unlock; + swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e..7bfb9bd 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet, while - * the other end is scheduled away waiting on discard - * I/O 
completion at scan_swap_map(). - * - * In order to avoid turning this transitory state - * into a permanent loop around this -EEXIST case - * if !CONFIG_PREEMPT and the I/O completion happens - * to be waiting on the CPU waitqueue where we are now - * busy looping, we just conditionally invoke the - * scheduler here, if there are some more important - * tasks to run. + * has not been brought into the swapcache yet. */ cond_resched(); continue; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1781308..b86b2aca 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); - unlock_cluster(ci_tail); + spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -672,6 +672,9 @@ checks: else goto done; } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); if (offset == si->lowest_bit) si->lowest_bit++; @@ -685,9 +688,6 @@ checks: plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); } - si->swap_map[offset] = usage; - inc_cluster_info_page(si, si->cluster_info, offset); - unlock_cluster(ci); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -1079,8 +1079,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) p = swap_info_get_cont(entries[i], prev); if (p) swap_entry_free(p, entries[i]); - else - break; prev = p; } if (p) @@ -1111,6 +1109,18 @@ int page_swapcount(struct page *page) return count; } +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +{ + int count = 0; + pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + return count; +} + /* * How many references to @entry are currently 
swapped out? * This does not give an exact answer when swap count is continued, @@ -1119,17 +1129,11 @@ int page_swapcount(struct page *page) int __swp_swapcount(swp_entry_t entry) { int count = 0; - pgoff_t offset; struct swap_info_struct *si; - struct swap_cluster_info *ci; si = __swap_info_get(entry); - if (si) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - } + if (si) + count = swap_swapcount(si, entry); return count; } @@ -1291,7 +1295,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page))) { + (!page_mapped(page) || mem_cgroup_swap_full(page)) && + !swap_swapcount(p, entry)) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 6263aff..83a059e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -266,9 +266,8 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t index; int i; - cleancache_invalidate_inode(mapping); if (mapping->nrpages == 0 && mapping->nrexceptional == 0) - return; + goto out; /* Offsets within partial pages */ partial_start = lstart & (PAGE_SIZE - 1); @@ -363,7 +362,7 @@ void truncate_inode_pages_range(struct address_space *mapping, * will be released, just zeroed, so we can bail out now. 
*/ if (start >= end) - return; + goto out; index = start; for ( ; ; ) { @@ -410,6 +409,8 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); index++; } + +out: cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -623,7 +624,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - cleancache_invalidate_inode(mapping); + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + goto out; + pagevec_init(&pvec, 0); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, @@ -686,6 +689,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cond_resched(); index++; } + +out: cleancache_invalidate_inode(mapping); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031e..4e7ed65 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -97,8 +97,13 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; - /* Can cgroups be reclaimed below their normal consumption range? */ - unsigned int may_thrash:1; + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; @@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. 
Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; - int ret = SWAP_SUCCESS; cond_resched(); @@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; - lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { - switch (ret = try_to_unmap(page, lazyfree ? 
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : - (ttu_flags | TTU_BATCH_FLUSH))) { - case SWAP_FAIL: + if (page_mapped(page)) { + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } @@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1280,9 +1277,6 @@ lazyfree: */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* @@ -1292,20 +1286,16 @@ free_it: list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: @@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0, total_skipped = 0; + unsigned long skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src);) { + !list_empty(src); scan++) { struct page *page; page = lru_to_page(src); @@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; } - /* - * Account for scanned and skipped separetly to avoid the pgdat - * being prematurely marked unreclaimable by pgdat_reclaimable. 
- */ - scan++; - switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (!list_empty(&pages_skipped)) { int zid; + list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } - - /* - * Account skipped pages as a partial scan as the pgdat may be - * close to unreclaimable. If the LRU list is empty, account - * skipped pages as a full scan. - */ - total_skipped = list_empty(src) ? skipped : skipped >> 2; - - list_splice(&pages_skipped, src); } - *nr_scanned = scan + total_skipped; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); @@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2033,6 +2006,8 @@ static void 
shrink_active_list(unsigned long nr_to_scan, * Both inactive lists should also be large enough that each inactive * page has a chance to be referenced again before it is reclaimed. * + * If that fails and refaulting is observed, the inactive list grows. + * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages * on this LRU, maintained by the pageout code. A zone->inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. @@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) + struct mem_cgroup *memcg, + struct scan_control *sc, bool actual_reclaim) { - unsigned long inactive_ratio; - unsigned long inactive, active; - enum lru_list inactive_lru = file * LRU_FILE; enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + enum lru_list inactive_lru = file * LRU_FILE; + unsigned long inactive, active; + unsigned long inactive_ratio; + unsigned long refaults; unsigned long gb; /* @@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); + if (memcg) + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); else - inactive_ratio = 1; + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + + /* + * When refaults are being observed, it means a new workingset + * is being established. Disable active list protection to get + * rid of the stale workingset quickly. 
+ */ + if (file && actual_reclaim && lruvec->refaults != refaults) { + inactive_ratio = 0; + } else { + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + } - if (trace) - trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, - sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + if (actual_reclaim) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) + struct lruvec *lruvec, struct mem_cgroup *memcg, + struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), + memcg, sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; - bool force_scan = false; unsigned long ap, fp; enum lru_list lru; - bool some_scanned; - int pass; - - /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. 
- */ - if (current_is_kswapd()) { - if (!pgdat_reclaimable(pgdat)) - force_scan = true; - if (!mem_cgroup_online(memcg)) - force_scan = true; - } - if (!global_reclaim(sc)) - force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { @@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && + if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - some_scanned = false; - /* Only use force_scan on second pass. */ - for (pass = 0; !some_scanned && pass < 2; pass++) { - *lru_pages = 0; - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; - - if (!scan && pass && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: - /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. 
- */ - scan = div64_u64(scan * fraction[file], - denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - size = 0; - scan = 0; - } - break; - default: - /* Look ma, no brain */ - BUG(); - } + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - *lru_pages += size; - nr[lru] = scan; + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(size, SWAP_CLUSTER_MAX); + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: /* - * Skip the second pass and don't force_scan, - * if we found something to scan. + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. */ - some_scanned |= !!scan; + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); } + + *lru_pages += size; + nr[lru] = scan; } } @@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); + lruvec, memcg, sc); } } @@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
*/ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { - if (!sc->may_thrash) + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; - mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + mem_cgroup_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2620,6 +2586,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2669,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. 
@@ -2752,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } +static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +{ + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root_memcg, NULL, NULL); + do { + unsigned long refaults; + struct lruvec *lruvec; + + if (memcg) + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); + else + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + + lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec->refaults = refaults; + } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); +} + /* * This is the main entry point to direct page reclaim. * @@ -2772,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; + pg_data_t *last_pgdat; + struct zoneref *z; + struct zone *zone; retry: delayacct_freepages_start(); @@ -2798,6 +2791,15 @@ retry: sc->may_writepage = 1; } while (--sc->priority >= 0); + last_pgdat = NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, + sc->nodemask) { + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + } + delayacct_freepages_end(); if (sc->nr_reclaimed) @@ -2808,16 +2810,17 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. 
*/ - if (!sc->may_thrash) { + if (sc->memcg_low_skipped) { sc->priority = initial_priority; - sc->may_thrash = 1; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; goto retry; } return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2825,10 +2828,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -2950,7 +2958,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -3030,7 +3038,7 @@ unsigned long 
try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3076,7 +3084,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3084,22 +3092,44 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, int classzone_idx) +/* + * Returns true if there is an eligible zone balanced for the request order + * and classzone_idx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone); + int i; + unsigned long mark = -1; + struct zone *zone; - if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) - return false; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return true; + } /* - * If any eligible zone is balanced then the node is not considered - * to be congested or dirty + * If a node has no populated zone within classzone_idx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. 
*/ - clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); - clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); + if (mark == -1) + return true; - return true; + return false; +} + +/* Clear pgdat state for congested, dirty or under writeback. */ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* @@ -3110,11 +3140,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - int i; - /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3128,17 +3156,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; - if (!zone_balanced(zone, order, classzone_idx)) - return false; + if (pgdat_balanced(pgdat, order, classzone_idx)) { + clear_pgdat_congested(pgdat); + return true; } - return true; + return false; } /* @@ -3214,9 +3241,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = 
classzone_idx; /* @@ -3241,23 +3268,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Check from - * high to low zone as allocations prefer higher zones. - * Scanning from low to high zone would allow congestion to be - * cleared during a very small window when a small low - * zone was balanced even under extreme pressure when the - * overall node may be congested. Note that sc.reclaim_idx - * is not used as buffer_heads_over_limit may have adjusted - * it. + * Only reclaim if there are no eligible zones. Note that + * sc.reclaim_idx is not used as buffer_heads_over_limit may + * have adjusted it. */ - for (i = classzone_idx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, sc.order, classzone_idx)) - goto out; - } + if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + goto out; /* * Do some background aging of the anon list, to give @@ -3271,7 +3287,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ @@ -3295,7 +3311,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. 
Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3306,11 +3322,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: + snapshot_refaults(NULL, pgdat); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3320,6 +3341,22 @@ out: return sc.order; } +/* + * pgdat->kswapd_classzone_idx is the highest zone index that a recent + * allocation request woke kswapd for. When kswapd has not woken recently, + * the value is MAX_NR_ZONES which is not a valid index. This compares a + * given classzone and returns it or the highest classzone index kswapd + * was recently woke for. + */ +static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, + enum zone_type classzone_idx) +{ + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + return classzone_idx; + + return max(pgdat->kswapd_classzone_idx, classzone_idx); +} + static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int classzone_idx) { @@ -3331,7 +3368,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. 
This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed. + */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to @@ -3355,7 +3398,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); } @@ -3409,7 +3452,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order, classzone_idx; + unsigned int alloc_order, reclaim_order; + unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3439,20 +3483,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = alloc_order = reclaim_order = 0; - pgdat->kswapd_classzone_idx = classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; for ( ; ; ) { bool ret; + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; + classzone_idx = kswapd_classzone_idx(pgdat, 0); pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3478,9 +3525,6 @@ kswapd_try_sleep: reclaim_order = balance_pgdat(pgdat, 
alloc_order, classzone_idx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; - - alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3496,7 +3540,6 @@ kswapd_try_sleep: void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; - int z; if (!managed_zone(zone)) return; @@ -3504,22 +3547,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, + classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Only wake kswapd if all zones are unbalanced */ - for (z = 0; z <= classzone_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; - if (zone_balanced(zone, order, classzone_idx)) - return; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) + return; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3725,7 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), @@ -3779,9 +3820,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t 
gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 5a4f5c5..f5fa1bd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -992,6 +991,7 @@ const char * const vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pglazyfree", "pgfault", "pgmajfault", @@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* Walk all the zones in a node and print using a callback */ +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. + */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) + if (assert_populated && !populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, true, frag_show_print); return 0; } @@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + 
walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); return 0; } @@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); return 0; } @@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) seq_printf(m, ", %ld", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1425,19 +1433,22 @@ static void 
zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } /* - * Output information about zones in @pgdat. + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). */ static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1586,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. 
- */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) @@ -1856,7 +1854,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, true, unusable_show_print); return 0; } @@ -1908,7 +1906,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, extfrag_show_print); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index eda05c7..b8c9ab6 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -269,7 +269,6 @@ bool workingset_refault(void *shadow) lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); - rcu_read_unlock(); /* * The unsigned subtraction here gives an accurate distance @@ -290,11 +289,15 @@ bool workingset_refault(void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_node_state(pgdat, WORKINGSET_REFAULT); + inc_memcg_state(memcg, WORKINGSET_REFAULT); if (refault_distance <= active_file) { inc_node_state(pgdat, WORKINGSET_ACTIVATE); + inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + rcu_read_unlock(); return true; } + rcu_read_unlock(); return false; } @@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); + inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); |