diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 12 | ||||
-rw-r--r-- | mm/backing-dev.c | 2 | ||||
-rw-r--r-- | mm/debug.c | 5 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/failslab.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 6 | ||||
-rw-r--r-- | mm/huge_memory.c | 15 | ||||
-rw-r--r-- | mm/hugetlb.c | 35 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 28 | ||||
-rw-r--r-- | mm/memcontrol.c | 10 | ||||
-rw-r--r-- | mm/memory-failure.c | 7 | ||||
-rw-r--r-- | mm/mempool.c | 10 | ||||
-rw-r--r-- | mm/migrate.c | 4 | ||||
-rw-r--r-- | mm/oom_kill.c | 17 | ||||
-rw-r--r-- | mm/page_alloc.c | 703 | ||||
-rw-r--r-- | mm/readahead.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 2 | ||||
-rw-r--r-- | mm/slab.c | 35 | ||||
-rw-r--r-- | mm/slub.c | 15 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/vmalloc.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 8 | ||||
-rw-r--r-- | mm/vmstat.c | 2 | ||||
-rw-r--r-- | mm/zbud.c | 2 | ||||
-rw-r--r-- | mm/zpool.c | 18 | ||||
-rw-r--r-- | mm/zsmalloc.c | 49 | ||||
-rw-r--r-- | mm/zswap.c | 87 |
28 files changed, 468 insertions, 628 deletions
@@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION -# -# If we have space for more page flags then we can enable additional -# optimizations and functionality. -# -# Regular Sparsemem takes page flag bits for the sectionid if it does not -# use a virtual memmap. Disable extended page flags for 32 bit platforms -# that require the use of a sectionid in the page flags. -# -config PAGEFLAGS_EXTENDED - def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM - # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 619984f..8ed2ffd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfp & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp)); if (!memcg_css->parent) return &bdi->wb; @@ -25,12 +25,7 @@ static const struct trace_print_flags pageflag_names[] = { {1UL << PG_private, "private" }, {1UL << PG_private_2, "private_2" }, {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, diff --git a/mm/dmapool.c b/mm/dmapool.c index 312a716..57312b5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(mem_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(mem_flags)); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/failslab.c b/mm/failslab.c index 98fb490..79171b4 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -3,11 +3,11 @@ static struct { struct fault_attr attr; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; bool cache_filter; } failslab = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .cache_filter = false, }; @@ -16,7 +16,7 @@ bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) + if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) @@ -42,7 +42,7 @@ static int __init failslab_debugfs_init(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait)) + &failslab.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) diff --git a/mm/filemap.c b/mm/filemap.c index 58e04e2..1bb00762 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1722,7 +1722,7 @@ no_cached_page: goto out; } error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1824,7 +1824,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) return -ENOMEM; ret = add_to_page_cache_lru(page, mapping, offset, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2713,7 +2713,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * page is known to the local caching routines. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * */ int try_to_release_page(struct page *page, gfp_t gfp_mask) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 00cfd1a..c29ddeb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void) for_each_populated_zone(zone) nr_zones++; - /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* @@ -786,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; } /* Caller must hold page table lock. */ @@ -1755,8 +1755,7 @@ static void __split_huge_page_refcount(struct page *page, (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); - /* clear PageTail before overwriting first_page */ - smp_wmb(); + clear_compound_head(page_tail); if (page_is_young(page)) set_page_young(page_tail); @@ -2413,8 +2412,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { VM_BUG_ON_PAGE(*hpage, *hpage); @@ -2481,8 +2479,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); @@ -2530,7 +2527,7 @@ static void collapse_huge_page(struct mm_struct *mm, __GFP_THISNODE; /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); if (!new_page) return; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 74ef0c6..7ce07d6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -994,23 +994,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) static void destroy_compound_gigantic_page(struct page *page, - unsigned long order) + unsigned int order) { int i; int nr_pages = 1 << order; struct page *p = page + 1; for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __ClearPageTail(p); + clear_compound_head(p); set_page_refcounted(p); - p->first_page = NULL; } set_compound_order(page, 0); __ClearPageHead(page); } -static void free_gigantic_page(struct page *page, unsigned order) +static void free_gigantic_page(struct page *page, unsigned int order) { free_contig_range(page_to_pfn(page), 1 << order); } @@ -1054,7 +1053,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned order) +static struct page *alloc_gigantic_page(int nid, unsigned int order) { unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; @@ -1090,7 +1089,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order) } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned long order); +static void prep_compound_gigantic_page(struct page *page, unsigned int order); static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { @@ -1123,9 +1122,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h, static inline bool gigantic_page_supported(void) { return true; } #else static inline bool gigantic_page_supported(void) { return false; } -static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, - unsigned long order) { } + unsigned int order) { } static inline int alloc_fresh_gigantic_page(struct hstate *h, nodemask_t *nodes_allowed) { return 0; } #endif @@ -1146,7 +1145,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - set_compound_page_dtor(page, NULL); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); @@ -1242,7 +1241,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; @@ -1251,7 +1250,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; @@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ __ClearPageReserved(p); set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -1294,7 +1290,7 @@ int PageHuge(struct page *page) return 0; page = compound_head(page); - return get_compound_page_dtor(page) == free_huge_page; + return page[1].compound_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -1568,7 +1564,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, if (page) { INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already @@ -1972,7 +1968,8 @@ found: return 1; } -static void __init prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, + unsigned int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -2683,7 +2680,7 @@ static int __init hugetlb_init(void) module_init(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) +void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 33d59ab..d8fb10d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -385,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void) /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgroup details. + * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); diff --git a/mm/internal.h b/mm/internal.h index d4b807d..38e24b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,25 @@ #include <linux/fs.h> #include <linux/mm.h> +/* + * The set of flags that only affect watermark checking and reclaim + * behaviour. This is used by the MM to obey the caller constraints + * about IO, FS and watermark checking while ignoring placement + * hints such as HIGHMEM usage. + */ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + +/* The GFP flags allowed during early boot */ +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) + +/* Control allocation cpuset and node placement constraints */ +#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) + +/* Do not use these with a slab allocator */ +#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -61,9 +80,9 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. */ - VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); + VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); if (get_page_head) - atomic_inc(&page->first_page->_count); + atomic_inc(&compound_head(page)->_count); get_huge_page_tail(page); } @@ -129,6 +148,7 @@ struct alloc_context { int classzone_idx; int migratetype; enum zone_type high_zoneidx; + bool spread_dirty_pages; }; /* @@ -157,7 +177,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); -extern void prep_compound_page(struct page *page, unsigned long order); +extern void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif @@ -215,7 +235,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use page_order_unsafe() below. */ -static inline unsigned long page_order(struct page *page) +static inline unsigned int page_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc502e5..9acfb16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2046,7 +2046,7 @@ retry: if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -2120,7 +2120,7 @@ done_restock: /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here - * if __GFP_WAIT but let's always punt for simplicity and so that + * if __GFP_RECLAIM but let's always punt for simplicity and so that * GFP_KERNEL can consistently be used during reclaim. @memcg is * not recorded as it most likely matches current's and won't * change in the meantime. As high limit is checked again before @@ -2801,7 +2801,7 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -4364,8 +4364,8 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 16a0ec3..8424b64 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -776,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define lru (1UL << PG_lru) #define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -800,12 +798,7 @@ static struct page_state { */ { slab, slab, MF_MSG_SLAB, me_kernel }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED { head, head, MF_MSG_HUGE, me_huge_page }, - { tail, tail, MF_MSG_HUGE, me_huge_page }, -#else - { compound, compound, MF_MSG_HUGE, me_huge_page }, -#endif { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, diff --git a/mm/mempool.c b/mm/mempool.c index 4c533bc..004d42b 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: @@ -349,7 +349,7 @@ repeat_alloc: } /* - * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ if (gfp_temp != gfp_mask) { @@ -358,8 +358,8 @@ repeat_alloc: goto repeat_alloc; } - /* We must not sleep if !__GFP_WAIT */ - if (!(gfp_mask & __GFP_WAIT)) { + /* We must not sleep if !__GFP_DIRECT_RECLAIM */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 2834fab..7890d0b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1578,7 +1578,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & - ~GFP_IOFS, 0); + ~(__GFP_IO | __GFP_FS), 0); return newpage; } @@ -1752,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e477828..d13a339 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -118,6 +118,15 @@ found: return t; } +/* + * order == -1 means the oom kill is required by sysrq, otherwise only + * for display purposes. + */ +static inline bool is_sysrq_oom(struct oom_control *oc) +{ + return oc->order == -1; +} + /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -265,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * Don't allow any other task to have access to the reserves. */ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (oc->order != -1) + if (!is_sysrq_oom(oc)) return OOM_SCAN_ABORT; } if (!task->mm) @@ -278,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && oc->order != -1) + if (task_will_free_mem(task) && !is_sysrq_oom(oc)) return OOM_SCAN_ABORT; return OOM_SCAN_OK; @@ -629,7 +638,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, return; } /* Do not panic for oom kills triggered by sysrq */ - if (oc->order == -1) + if (is_sysrq_oom(oc)) return; dump_header(oc, NULL, memcg); panic("Out of memory: %s panic_on_oom is enabled\n", @@ -709,7 +718,7 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p && oc->order != -1) { + if (!p && !is_sysrq_oom(oc)) { dump_header(oc, NULL, NULL); panic("Out of memory and no killable processes...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 446bb36..208e4c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -169,19 +169,19 @@ void pm_restrict_gfp_mask(void) WARN_ON(!mutex_is_locked(&pm_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~GFP_IOFS; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } bool pm_suspended_storage(void) { - if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) return false; return true; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -int pageblock_order __read_mostly; +unsigned int pageblock_order __read_mostly; #endif static void __free_pages_ok(struct page *page, unsigned int order); @@ -229,6 +229,15 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +static void free_compound_page(struct page *page); +compound_page_dtor * const compound_page_dtors[] = { + NULL, + free_compound_page, +#ifdef CONFIG_HUGETLB_PAGE + free_huge_page, +#endif +}; + int min_free_kbytes = 1024; int user_min_free_kbytes = -1; @@ -436,15 +445,15 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. * - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ @@ -453,21 +462,18 @@ static void free_compound_page(struct page *page) __free_pages_ok(page, compound_order(page)); } -void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; - set_compound_page_dtor(page, free_compound_page); + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -656,7 +662,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - int max_order = MAX_ORDER; + unsigned int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -669,7 +675,7 @@ static inline void __free_one_page(struct page *page, * pageblock. Without this, pageblock isolation * could cause incorrect freepage accounting. */ - max_order = min(MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); } else { __mod_zone_freepage_state(zone, 1 << order, migratetype); } @@ -817,7 +823,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); @@ -846,17 +851,30 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; } - return 0; + ret = 0; +out: + clear_compound_head(page); + return ret; } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -923,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + SetPageReserved(page); } } @@ -1417,15 +1439,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * the free lists for the desirable migrate type are depleted */ static int fallbacks[MIGRATE_TYPES][4] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif - [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ #endif }; @@ -1450,7 +1471,7 @@ int move_freepages(struct zone *zone, int migratetype) { struct page *page; - unsigned long order; + unsigned int order; int pages_moved = 0; #ifndef CONFIG_HOLES_IN_ZONE @@ -1563,7 +1584,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type) { - int current_order = page_order(page); + unsigned int current_order = page_order(page); int pages; /* Take ownership for orders >= pageblock_order */ @@ -1598,7 +1619,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, *can_steal = false; for (i = 0;; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_RESERVE) + if (fallback_mt == MIGRATE_TYPES) break; if (list_empty(&area->free_list[fallback_mt])) @@ -1617,6 +1638,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; } +/* + * Reserve a pageblock for exclusive use of high-order atomic allocations if + * there are no empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, + unsigned int alloc_order) +{ + int mt; + unsigned long max_managed, flags; + + /* + * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * Check is race-prone but harmless. + */ + max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_HIGHATOMIC && + !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + zone->nr_reserved_highatomic += pageblock_nr_pages; + set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + */ +static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { + /* Preserve at least one pageblock */ + if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct free_area *area = &(zone->free_area[order]); + + if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + continue; + + page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, + struct page, lru); + + /* + * It should never happen but changes to locking could + * inadvertently allow a per-cpu drain to add pages + * to MIGRATE_HIGHATOMIC while unreserving so be safe + * and watch for underflows. + */ + zone->nr_reserved_highatomic -= min(pageblock_nr_pages, + zone->nr_reserved_highatomic); + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + set_pageblock_migratetype(page, ac->migratetype); + move_freepages_block(zone, page, ac->migratetype); + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + spin_unlock_irqrestore(&zone->lock, flags); + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) @@ -1672,29 +1788,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. */ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) + int migratetype, gfp_t gfp_flags) { struct page *page; -retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - - if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { + if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); if (!page) page = __rmqueue_fallback(zone, order, migratetype); - - /* - * Use MIGRATE_RESERVE rather than fail an allocation. goto - * is used because __rmqueue_smallest is an inline function - * and we want just one call site - */ - if (!page) { - migratetype = MIGRATE_RESERVE; - goto retry_reserve; - } } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -1714,7 +1818,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype); + struct page *page = __rmqueue(zone, order, migratetype, 0); if (unlikely(page == NULL)) break; @@ -2086,7 +2190,7 @@ int split_free_page(struct page *page) static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int alloc_flags, int migratetype) { unsigned long flags; struct page *page; @@ -2129,7 +2233,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order, migratetype); + + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype, gfp_flags); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2160,11 +2272,11 @@ static struct { struct fault_attr attr; bool ignore_gfp_highmem; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; @@ -2183,7 +2295,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2202,7 +2315,7 @@ static int __init fail_page_alloc_debugfs(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait)) + &fail_page_alloc.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem)) @@ -2232,42 +2345,77 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return true if free pages are above 'mark'. This takes into account the order - * of the allocation. + * Return true if free base pages are above 'mark'. For high-order checks it + * will return true of the order-0 watermark is reached and there is at least + * one free page of a suitable size. Checking now avoids taking the zone lock + * to check in the allocation paths if no pages are free. */ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages may go negative - that's OK */ long min = mark; int o; - long free_cma = 0; + const int alloc_harder = (alloc_flags & ALLOC_HARDER); + /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (alloc_flags & ALLOC_HARDER) + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. + */ + if (likely(!alloc_harder)) + free_pages -= z->nr_reserved_highatomic; + else min -= min / 4; + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + /* + * Check watermarks for an order-0 allocation request. If these + * are not met, then a high-order request also cannot go ahead + * even if a suitable page happened to be free. + */ + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return false; - for (o = 0; o < order; o++) { - /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; - /* Require fewer higher order pages to be free */ - min >>= 1; + /* If this is an order-0 request then the watermark is fine */ + if (!order) + return true; + + /* For a high-order request, check at least one suitable page is free */ + for (o = order; o < MAX_ORDER; o++) { + struct free_area *area = &z->free_area[o]; + int mt; + + if (!area->nr_free) + continue; + + if (alloc_harder) + return true; + + for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { + if (!list_empty(&area->free_list[mt])) + return true; + } - if (free_pages <= min) - return false; +#ifdef CONFIG_CMA + if ((alloc_flags & ALLOC_CMA) && + !list_empty(&area->free_list[MIGRATE_CMA])) { + return true; + } +#endif } - return true; + return false; } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, @@ -2278,134 +2426,18 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags) + unsigned long mark, int classzone_idx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, classzone_idx, 0, free_pages); } #ifdef CONFIG_NUMA -/* - * zlc_setup - Setup for "zonelist cache". Uses cached zone data to - * skip over zones that are not allowed by the cpuset, or that have - * been recently (in last second) found to be nearly full. See further - * comments in mmzone.h. Reduces cache footprint of zonelist scans - * that have to skip over a lot of full or unallowed zones. - * - * If the zonelist cache is present in the passed zonelist, then - * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_MEMORY].) - * - * If the zonelist cache is not available for this zonelist, does - * nothing and returns NULL. - * - * If the fullzones BITMAP in the zonelist cache is stale (more than - * a second since last zap'd) then we zap it out (clear its bits.) - * - * We hold off even calling zlc_setup, until after we've checked the - * first zone in the zonelist, on the theory that most allocations will - * be satisfied from that first zone, so best to examine that zone as - * quickly as we can. - */ -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - nodemask_t *allowednodes; /* zonelist_cache approximation */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return NULL; - - if (time_after(jiffies, zlc->last_full_zap + HZ)) { - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - zlc->last_full_zap = jiffies; - } - - allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? - &cpuset_current_mems_allowed : - &node_states[N_MEMORY]; - return allowednodes; -} - -/* - * Given 'z' scanning a zonelist, run a couple of quick checks to see - * if it is worth looking at further for free memory: - * 1) Check that the zone isn't thought to be full (doesn't have its - * bit set in the zonelist_cache fullzones BITMAP). - * 2) Check that the zones node (obtained from the zonelist_cache - * z_to_n[] mapping) is allowed in the passed in allowednodes mask. - * Return true (non-zero) if zone is worth looking at further, or - * else return false (zero) if it is not. - * - * This check -ignores- the distinction between various watermarks, - * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is - * found to be full for any variation of these watermarks, it will - * be considered full for up to one second by all requests, unless - * we are so low on memory on all allowed nodes that we are forced - * into the second scan of the zonelist. - * - * In the second scan we ignore this zonelist cache and exactly - * apply the watermarks to all zones, even it is slower to do so. - * We are low on memory in the second scan, and should leave no stone - * unturned looking for a free page. - */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - int n; /* node that zone *z is on */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return 1; - - i = z - zonelist->_zonerefs; - n = zlc->z_to_n[i]; - - /* This zone is worth trying if it is allowed but not full */ - return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); -} - -/* - * Given 'z' scanning a zonelist, set the corresponding bit in - * zlc->fullzones, so that subsequent attempts to allocate a page - * from that zone don't waste time re-examining it. - */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - i = z - zonelist->_zonerefs; - - set_bit(i, zlc->fullzones); -} - -/* - * clear all zones full, called after direct reclaim makes progress so that - * a zone that was recently full is not skipped over for up to a second - */ -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return local_zone->node == zone->node; @@ -2416,28 +2448,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } - #else /* CONFIG_NUMA */ - -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - return NULL; -} - -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - return 1; -} - -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ -} - -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return true; @@ -2447,7 +2458,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } - #endif /* CONFIG_NUMA */ static void reset_alloc_batches(struct zone *preferred_zone) @@ -2474,11 +2484,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ - int zlc_active = 0; /* set if using zonelist_cache */ - int did_zlc_setup = 0; /* just call zlc_setup() one time */ - bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE); int nr_fair_skipped = 0; bool zonelist_rescan; @@ -2493,9 +2498,6 @@ zonelist_scan: ac->nodemask) { unsigned long mark; - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) - continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed(zone, gfp_mask)) @@ -2533,14 +2535,14 @@ zonelist_scan: * * XXX: For now, allow allocations to potentially * exceed the per-zone dirty limit in the slowpath - * (ALLOC_WMARK_LOW unset) before going into reclaim, + * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed * zones are together not big enough to reach the * global limit. The proper fix for these situations * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if (consider_zone_dirty && !zone_dirty_ok(zone)) + if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; @@ -2553,28 +2555,8 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && - !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup if there are multiple nodes - * and before considering the first zone allowed - * by the cpuset. - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } - if (zone_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zone, zone)) - goto this_zone_full; - - /* - * As we may have just activated ZLC, check if the first - * eligible zone has failed zone_reclaim recently. - */ - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2591,34 +2573,26 @@ zonelist_scan: ac->classzone_idx, alloc_flags)) goto try_this_zone; - /* - * Failed to reclaim enough to meet watermark. - * Only mark the zone full if checking the min - * watermark or if we failed to reclaim just - * 1<<order pages or else the page allocator - * fastpath will prematurely mark zones full - * when the watermark is between the low and - * min watermarks. - */ - if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || - ret == ZONE_RECLAIM_SOME) - goto this_zone_full; - continue; } } try_this_zone: page = buffered_rmqueue(ac->preferred_zone, zone, order, - gfp_mask, ac->migratetype); + gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; + + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + reserve_highatomic_pageblock(page, zone, order); + return page; } -this_zone_full: - if (IS_ENABLED(CONFIG_NUMA) && zlc_active) - zlc_mark_zone_full(zonelist, z); } /* @@ -2639,12 +2613,6 @@ this_zone_full: zonelist_rescan = true; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - zonelist_rescan = true; - } - if (zonelist_rescan) goto zonelist_scan; @@ -2669,7 +2637,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; @@ -2686,7 +2654,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { @@ -2703,7 +2671,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); @@ -2889,19 +2857,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; - /* After successful reclaim, reconsider all zones for allocation */ - if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(ac->zonelist); - retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because - * pages are pinned on the per-cpu lists. Drain them and try again + * pages are pinned on the per-cpu lists or in high alloc reserves. + * Shrink them them and try again */ if (!page && !drained) { + unreserve_highatomic_pageblock(ac); drain_all_pages(NULL); drained = true; goto retry; @@ -2946,7 +2912,6 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2955,11 +2920,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (atomic) { + if (gfp_mask & __GFP_ATOMIC) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -2996,11 +2961,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +{ + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { - const gfp_t wait = gfp_mask & __GFP_WAIT; + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; @@ -3021,15 +2991,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + + /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) goto nopage; retry: - if (!(gfp_mask & __GFP_NO_KSWAPD)) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* @@ -3072,8 +3050,8 @@ retry: } } - /* Atomic allocations - we can't balance anything */ - if (!wait) { + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) { /* * All existing users of the deprecated __GFP_NOFAIL are * blockable, so warn of any new users that actually allow this @@ -3103,7 +3081,7 @@ retry: goto got_pg; /* Checks for THP-specific high-order allocations */ - if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + if (is_thp_gfp_mask(gfp_mask)) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -3138,8 +3116,7 @@ retry: * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || - (current->flags & PF_KTHREAD)) + if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ @@ -3210,7 +3187,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, lockdep_trace_alloc(gfp_mask); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return NULL; @@ -3231,6 +3208,10 @@ retry_cpuset: /* We set it here, as __alloc_pages_slowpath might have changed it */ ac.zonelist = zonelist; + + /* Dirty zone balancing only done in the fast path */ + ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask ? : &cpuset_current_mems_allowed, @@ -3249,6 +3230,7 @@ retry_cpuset: * complete. */ alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; page = __alloc_pages_slowpath(alloc_mask, order, &ac); } @@ -3467,7 +3449,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order) } } -static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +static void *make_alloc_exact(unsigned long addr, unsigned int order, + size_t size) { if (addr) { unsigned long alloc_end = addr + (PAGE_SIZE << order); @@ -3517,7 +3500,7 @@ EXPORT_SYMBOL(alloc_pages_exact); */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { - unsigned order = get_order(size); + unsigned int order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; @@ -3666,7 +3649,6 @@ static void show_migration_types(unsigned char type) [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RESERVE] = 'R', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif @@ -3819,7 +3801,8 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned int order; + unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4168,7 +4151,7 @@ static void build_zonelists(pg_data_t *pgdat) nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - int order = current_zonelist_order; + unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -4212,20 +4195,6 @@ static void build_zonelists(pg_data_t *pgdat) build_thisnode_zonelists(pgdat); } -/* Construct the zonelist performance cache - see further mmzone.h */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zoneref *z; - - zonelist = &pgdat->node_zonelists[0]; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->_zonerefs; z->zone; z++) - zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); -} - #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. @@ -4286,12 +4255,6 @@ static void build_zonelists(pg_data_t *pgdat) zonelist->_zonerefs[j].zone_idx = 0; } -/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - pgdat->node_zonelists[0].zlcache_ptr = NULL; -} - #endif /* CONFIG_NUMA */ /* @@ -4332,14 +4295,12 @@ static int __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); - build_zonelist_cache(self); } for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); - build_zonelist_cache(pgdat); } /* @@ -4499,120 +4460,6 @@ static inline unsigned long wait_table_bits(unsigned long size) } /* - * Check if a pageblock contains reserved pages - */ -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) - return 1; - } - return 0; -} - -/* - * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on min_wmark_pages(zone). The memory within - * the reserve will tend to store contiguous free pages. Setting min_free_kbytes - * higher will lead to a bigger reserve which will get freed as contiguous - * blocks as reclaim kicks in - */ -static void setup_zone_migrate_reserve(struct zone *zone) -{ - unsigned long start_pfn, pfn, end_pfn, block_end_pfn; - struct page *page; - unsigned long block_migratetype; - int reserve; - int old_reserve; - - /* - * Get the start pfn, end pfn and the number of blocks to reserve - * We have to be careful to be aligned to pageblock_nr_pages to - * make sure that we always check pfn_valid for the first page in - * the block. - */ - start_pfn = zone->zone_start_pfn; - end_pfn = zone_end_pfn(zone); - start_pfn = roundup(start_pfn, pageblock_nr_pages); - reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> - pageblock_order; - - /* - * Reserve blocks are generally in place to help high-order atomic - * allocations that are short-lived. A min_free_kbytes value that - * would result in more than 2 reserve blocks for atomic allocations - * is assumed to be in place to help anti-fragmentation for the - * future allocation of hugepages at runtime. - */ - reserve = min(2, reserve); - old_reserve = zone->nr_migrate_reserve_block; - - /* When memory hot-add, we almost always need to do nothing */ - if (reserve == old_reserve) - return; - zone->nr_migrate_reserve_block = reserve; - - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) - return; - - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - - /* Watch out for overlapping nodes */ - if (page_to_nid(page) != zone_to_nid(zone)) - continue; - - block_migratetype = get_pageblock_migratetype(page); - - /* Only test what is necessary when the reserves are not met */ - if (reserve > 0) { - /* - * Blocks with reserved pages will never free, skip - * them. - */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - - /* If this block is reserved, account for it */ - if (block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } - - /* Suitable for reserving if this block is movable */ - if (block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, - MIGRATE_RESERVE); - move_freepages_block(zone, page, - MIGRATE_RESERVE); - reserve--; - continue; - } - } else if (!old_reserve) { - /* - * At boot time we don't need to scan the whole zone - * for turning off MIGRATE_RESERVE. - */ - break; - } - - /* - * If the reserve is met and this is a previous reserved block, - * take it back - */ - if (block_migratetype == MIGRATE_RESERVE) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); - } - } -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -4651,9 +4498,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() + * kernel allocations are made. * * bitmap is created for zone's valid pfn range. but memmap * can be created for invalid pages (for alignment) @@ -6214,7 +6059,6 @@ static void __setup_per_zone_wmarks(void) high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -6836,7 +6680,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { unsigned long outer_start, outer_end; - int ret = 0, order; + unsigned int order; + int ret = 0; struct compact_control cc = { .nr_migratepages = 0, diff --git a/mm/readahead.c b/mm/readahead.c index 998ad59..ba22d7f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -90,7 +90,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, page = list_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -128,7 +128,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, struct page *page = list_to_page(pages); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); @@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt; #include <asm/uaccess.h> #include <asm/pgtable.h> +#include "internal.h" + #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) @@ -1031,12 +1031,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not invoke reclaim - * or warn about failures. + * Construct gfp mask to allocate from a specific node but do not direct reclaim + * or warn about failures. kswapd may still wake to reclaim in the background. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; } #endif @@ -1889,21 +1889,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - - /* - * RCU free overloads the RCU head over the LRU. - * slab_page has been overloeaded over the LRU, - * however it is not used from now on so that - * we can use it safely. - */ - head = (void *)&page->rcu_head; - call_rcu(head, kmem_rcu_free); - - } else { + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + call_rcu(&page->rcu_head, kmem_rcu_free); + else kmem_freepages(cachep, page); - } /* * From now on, we don't use freelist @@ -2633,7 +2622,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* @@ -2663,7 +2652,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, page); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); check_irq_off(); spin_lock(&n->list_lock); @@ -2677,7 +2666,7 @@ static int cache_grow(struct kmem_cache *cachep, opps1: kmem_freepages(cachep, page); failed: - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return 0; } @@ -2869,7 +2858,7 @@ force_grow: static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); #if DEBUG kmem_flagcheck(cachep, flags); #endif @@ -3057,11 +3046,11 @@ retry: */ struct page *page; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); kmem_flagcheck(cache, flags); page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); if (page) { /* @@ -1265,7 +1265,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s->object_size, flags, s->flags)) return NULL; @@ -1353,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; @@ -1363,8 +1363,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1424,7 +1424,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; @@ -1507,10 +1507,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(s->reserved != sizeof(*head)); head = page_address(page) + offset; } else { - /* - * RCU free overloads the RCU head over the LRU - */ - head = (void *)&page->lru; + head = &page->rcu_head; } call_rcu(head, rcu_free_slab); @@ -201,7 +201,7 @@ out_put_single: __put_single_page(page); return; } - VM_BUG_ON_PAGE(page_head != page->first_page, page); + VM_BUG_ON_PAGE(page_head != compound_head(page), page); /* * We can release the refcount taken by * get_page_unless_zero() now that @@ -262,7 +262,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. */ - page_head = compound_head_by_tail(page); + page_head = compound_head(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9db9ef5..d045634 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -35,6 +35,8 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include "internal.h" + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -1617,7 +1619,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 55721b6..2aec424 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1476,7 +1476,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -2477,7 +2477,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where @@ -2960,7 +2960,7 @@ static bool zone_balanced(struct zone *zone, int order, unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, @@ -3791,7 +3791,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index ffcb4f5..879a2be 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -923,7 +923,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Reclaimable", "Movable", - "Reserve", + "HighAtomic", #ifdef CONFIG_CMA "CMA", #endif @@ -137,7 +137,7 @@ static const struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(char *name, gfp_t gfp, +static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { @@ -18,8 +18,6 @@ #include <linux/zpool.h> struct zpool { - char *type; - struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; @@ -73,7 +71,8 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); -static struct zpool_driver *zpool_get_driver(char *type) +/* this assumes @type is null-terminated. */ +static struct zpool_driver *zpool_get_driver(const char *type) { struct zpool_driver *driver; @@ -113,6 +112,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * not be loaded, and calling @zpool_create_pool() with the pool type will * fail. * + * The @type string must be null-terminated. + * * Returns: true if @type pool is available, false if not */ bool zpool_has_pool(char *type) @@ -145,9 +146,11 @@ EXPORT_SYMBOL(zpool_has_pool); * * Implementations must guarantee this to be thread-safe. * + * The @type and @name strings must be null-terminated. + * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, const struct zpool_ops *ops) { struct zpool_driver *driver; @@ -174,7 +177,6 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - zpool->type = driver->type; zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; @@ -208,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_debug("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->driver->type); spin_lock(&pools_lock); list_del(&zpool->list); @@ -228,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool) * * Returns: The type of zpool. */ -char *zpool_get_type(struct zpool *zpool) +const char *zpool_get_type(struct zpool *zpool) { - return zpool->type; + return zpool->driver->type; } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f135b1b..9f15bdd 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,7 +16,7 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -26,8 +26,7 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page * If the page is first_page for huge object, it stores handle. * Look at size_class->huge. * page->freelist: points to the first free object in zspage. @@ -38,6 +37,7 @@ * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. * page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -58,7 +58,7 @@ #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> -#include <linux/hardirq.h> +#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/debugfs.h> @@ -166,9 +166,14 @@ enum zs_stat_type { OBJ_USED, CLASS_ALMOST_FULL, CLASS_ALMOST_EMPTY, - NR_ZS_STAT_TYPE, }; +#ifdef CONFIG_ZSMALLOC_STAT +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif + struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -237,7 +242,7 @@ struct link_free { }; struct zs_pool { - char *name; + const char *name; struct size_class **size_class; struct kmem_cache *handle_cachep; @@ -311,7 +316,7 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, +static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { @@ -447,19 +452,23 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] += cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] -= cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - return class->stats.objs[type]; + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; } #ifdef CONFIG_ZSMALLOC_STAT @@ -548,7 +557,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(char *name, struct zs_pool *pool) +static int zs_pool_stat_create(const char *name, struct zs_pool *pool) { struct dentry *entry; @@ -588,7 +597,7 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) { return 0; } @@ -764,7 +773,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -824,7 +833,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, { if (class->huge) { VM_BUG_ON(!is_first_page(page)); - return *(unsigned long *)page_private(page); + return page_private(page); } else return *(unsigned long *)obj; } @@ -949,7 +958,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. Also, we set PG_private to identify the first page @@ -974,7 +983,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) if (i == 1) set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ @@ -1428,8 +1437,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - enum fullness_group fullness; BUG_ON(!obj); @@ -1437,7 +1444,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); - get_zspage_mapping(first_page, &class_idx, &fullness); f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); vaddr = kmap_atomic(f_page); @@ -1822,9 +1828,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - if (!pool->shrinker_enabled) - return 0; - for (i = zs_size_classes - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) @@ -1866,7 +1869,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name, gfp_t flags) { int i; struct zs_pool *pool; @@ -82,33 +82,27 @@ module_param_named(enabled, zswap_enabled, bool, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" -static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT; -static struct kparam_string zswap_compressor_kparam = { - .string = zswap_compressor, - .maxlen = sizeof(zswap_compressor), -}; +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, - .get = param_get_string, + .get = param_get_charp, + .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, - &zswap_compressor_kparam, 0644); + &zswap_compressor, 0644); /* Compressed storage zpool to use */ #define ZSWAP_ZPOOL_DEFAULT "zbud" -static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT; -static struct kparam_string zswap_zpool_kparam = { - .string = zswap_zpool_type, - .maxlen = sizeof(zswap_zpool_type), -}; +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_zpool_param_ops = { - .set = zswap_zpool_param_set, - .get = param_get_string, + .set = zswap_zpool_param_set, + .get = param_get_charp, + .free = param_free_charp, }; -module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644); +module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; @@ -342,7 +336,7 @@ static void zswap_entry_put(struct zswap_tree *tree, static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { - struct zswap_entry *entry = NULL; + struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) @@ -571,7 +565,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -615,19 +609,29 @@ error: return NULL; } -static struct zswap_pool *__zswap_pool_create_fallback(void) +static __init struct zswap_pool *__zswap_pool_create_fallback(void) { if (!crypto_has_comp(zswap_compressor, 0, 0)) { + if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("default compressor %s not available\n", + zswap_compressor); + return NULL; + } pr_err("compressor %s not available, using default %s\n", zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); - strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT, - sizeof(zswap_compressor)); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; } if (!zpool_has_pool(zswap_zpool_type)) { + if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + return NULL; + } pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); - strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT, - sizeof(zswap_zpool_type)); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; } return zswap_pool_create(zswap_zpool_type, zswap_compressor); @@ -684,43 +688,39 @@ static void zswap_pool_put(struct zswap_pool *pool) * param callbacks **********************************/ +/* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char *compressor) { struct zswap_pool *pool, *put_pool = NULL; - char str[kp->str->maxlen], *s; + char *s = strstrip((char *)val); int ret; - /* - * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined - * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or - * 32 (arbitrary). - */ - strlcpy(str, val, kp->str->maxlen); - s = strim(str); + /* no change required */ + if (!strcmp(s, *(char **)kp->arg)) + return 0; /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ if (!zswap_init_started) - return param_set_copystring(s, kp); - - /* no change required */ - if (!strncmp(kp->str->string, s, kp->str->maxlen)) - return 0; + return param_set_charp(s, kp); if (!type) { - type = s; - if (!zpool_has_pool(type)) { - pr_err("zpool %s not available\n", type); + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); return -ENOENT; } + type = s; } else if (!compressor) { - compressor = s; - if (!crypto_has_comp(compressor, 0, 0)) { - pr_err("compressor %s not available\n", compressor); + if (!crypto_has_comp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); return -ENOENT; } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&zswap_pools_lock); @@ -736,7 +736,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } if (pool) - ret = param_set_copystring(s, kp); + ret = param_set_charp(s, kp); else ret = -EINVAL; @@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); ret = zpool_malloc(entry->pool->zpool, len, - __GFP_NORETRY | __GFP_NOWARN, &handle); + __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, + &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; |