diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 22 | ||||
-rw-r--r-- | mm/filemap.c | 50 | ||||
-rw-r--r-- | mm/huge_memory.c | 13 | ||||
-rw-r--r-- | mm/hugetlb.c | 20 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/memory.c | 58 | ||||
-rw-r--r-- | mm/page-writeback.c | 6 | ||||
-rw-r--r-- | mm/slab.c | 6 | ||||
-rw-r--r-- | mm/slab.h | 1 | ||||
-rw-r--r-- | mm/slab_common.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 41 | ||||
-rw-r--r-- | mm/truncate.c | 8 | ||||
-rw-r--r-- | mm/util.c | 10 | ||||
-rw-r--r-- | mm/vmacache.c | 8 | ||||
-rw-r--r-- | mm/vmscan.c | 20 |
15 files changed, 183 insertions, 113 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 37f9762..627dc2e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -671,16 +671,20 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; + unsigned long high_pfn, low_pfn, pfn, z_end_pfn; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to + * the pageblock boundary, because we do pfn -= pageblock_nr_pages + * in the for loop. + * The low boundary is the end of the pageblock the migration scanner + * is using. */ - pfn = cc->free_pfn; + pfn = cc->free_pfn & ~(pageblock_nr_pages-1); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* @@ -700,6 +704,7 @@ static void isolate_freepages(struct zone *zone, for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; + unsigned long end_pfn; /* * This can iterate a massively long zone without finding any @@ -734,13 +739,10 @@ static void isolate_freepages(struct zone *zone, isolated = 0; /* - * As pfn may not start aligned, pfn+pageblock_nr_page - * may cross a MAX_ORDER_NR_PAGES boundary and miss - * a pfn_valid check. Ensure isolate_freepages_block() - * only scans within a pageblock + * Take care when isolating in last pageblock of a zone which + * ends in the middle of a pageblock. */ - end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - end_pfn = min(end_pfn, z_end_pfn); + end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); isolated = isolate_freepages_block(cc, pfn, end_pfn, freelist, false); nr_freepages += isolated; diff --git a/mm/filemap.c b/mm/filemap.c index a82fbe4..000a220 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -906,8 +906,8 @@ EXPORT_SYMBOL(page_cache_prev_hole); * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * - * If the slot holds a shadow entry of a previously evicted page, it - * is returned. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * * Otherwise, %NULL is returned. */ @@ -928,9 +928,9 @@ repeat: if (radix_tree_deref_retry(page)) goto repeat; /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto out; } @@ -983,8 +983,8 @@ EXPORT_SYMBOL(find_get_page); * page cache page, it is returned locked and with an increased * refcount. * - * If the slot holds a shadow entry of a previously evicted page, it - * is returned. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * * Otherwise, %NULL is returned. * @@ -1099,8 +1099,8 @@ EXPORT_SYMBOL(find_or_create_page); * with ascending indexes. There may be holes in the indices due to * not-present pages. * - * Any shadow entries of evicted pages are included in the returned - * array. + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. * * find_get_entries() returns the number of pages and shadow entries * which were found. @@ -1128,9 +1128,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto export; } @@ -1198,9 +1198,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so skip over it - - * we only reach this from invalidate_mapping_pages(). + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. */ continue; } @@ -1265,9 +1265,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so stop looking for - * contiguous pages. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. */ break; } @@ -1341,10 +1341,17 @@ repeat: goto restart; } /* - * This function is never used on a shmem/tmpfs - * mapping, so a swap entry won't be found here. + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. */ - BUG(); + continue; } if (!page_cache_get_speculative(page)) @@ -2581,7 +2588,6 @@ EXPORT_SYMBOL(generic_perform_write); * @iocb: IO state structure (file, offset, etc.) * @iov: vector with data to write * @nr_segs: number of segments in the vector - * @ppos: position where to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 64635f5..b4b1feb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1536,16 +1536,23 @@ pmd_t *page_check_address_pmd(struct page *page, enum page_check_address_pmd_flag flag, spinlock_t **ptl) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) return NULL; - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + *ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) goto unlock; if (pmd_page(*pmd) != page) goto unlock; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd30f22..c82290b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1172,6 +1172,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } @@ -1980,11 +1981,7 @@ static int __init hugetlb_init(void) { int i; - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -2111,6 +2108,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2164,6 +2164,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2189,6 +2192,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2205,6 +2210,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2219,6 +2226,9 @@ void hugetlb_show_meminfo(void) struct hstate *h; int nid; + if (!hugepages_supported()) + return; + for_each_node_state(nid, N_MEMORY) for_each_hstate(h) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f0..c47dffd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6686,16 +6686,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } diff --git a/mm/memory.c b/mm/memory.c index d0f0bef..037b812 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -232,17 +232,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #endif } -void tlb_flush_mmu(struct mmu_gather *tlb) +static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { - struct mmu_gather_batch *batch; - - if (!tlb->need_flush) - return; tlb->need_flush = 0; tlb_flush(tlb); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif +} + +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; for (batch = &tlb->local; batch; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); @@ -251,6 +252,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb->active = &tlb->local; } +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + if (!tlb->need_flush) + return; + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + /* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources * that were required. @@ -1127,8 +1136,10 @@ again: if (PageAnon(page)) rss[MM_ANONPAGES]--; else { - if (pte_dirty(ptent)) + if (pte_dirty(ptent)) { + force_flush = 1; set_page_dirty(page); + } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); @@ -1137,9 +1148,10 @@ again: page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (unlikely(!__tlb_remove_page(tlb, page))) { + force_flush = 1; break; + } continue; } /* @@ -1174,18 +1186,11 @@ again: add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - /* - * mmu_gather ran out of room to batch pages, we break out of - * the PTE lock to avoid doing the potential expensive TLB invalidate - * and page-free while holding it. - */ + /* Do the actual TLB flush before dropping ptl */ if (force_flush) { unsigned long old_end; - force_flush = 0; - /* * Flush the TLB just for the previous segment, * then update the range to be the remaining @@ -1193,11 +1198,21 @@ again: */ old_end = tlb->end; tlb->end = addr; - - tlb_flush_mmu(tlb); - + tlb_flush_mmu_tlbonly(tlb); tlb->start = addr; tlb->end = old_end; + } + pte_unmap_unlock(start_pte, ptl); + + /* + * If we forced a TLB flush (either due to running out of + * batch buffers or because we needed to flush dirty TLB + * entries before releasing the ptl), free the batched + * memory too. Restart if we didn't do everything. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu_free(tlb); if (addr != end) goto again; @@ -1955,12 +1970,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) { struct vm_area_struct *vma; + vm_flags_t vm_flags; int ret; vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ef41349..a4317da 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -593,14 +593,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ -static inline long long pos_ratio_polynom(unsigned long setpoint, +static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -842,7 +842,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; @@ -166,7 +166,7 @@ typedef unsigned char freelist_idx_t; typedef unsigned short freelist_idx_t; #endif -#define SLAB_OBJ_MAX_NUM (1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) +#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* * true if a page was allocated from pfmemalloc reserves for network-based @@ -2572,13 +2572,13 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline freelist_idx_t get_free_obj(struct page *page, unsigned char idx) +static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) { return ((freelist_idx_t *)page->freelist)[idx]; } static inline void set_free_obj(struct page *page, - unsigned char idx, freelist_idx_t val) + unsigned int idx, freelist_idx_t val) { ((freelist_idx_t *)(page->freelist))[idx] = val; } @@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; diff --git a/mm/slab_common.c b/mm/slab_common.c index f3cfccf..102cc6f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -323,6 +323,12 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) } #endif /* CONFIG_MEMCG_KMEM */ +void slab_kmem_cache_release(struct kmem_cache *s) +{ + kfree(s->name); + kmem_cache_free(kmem_cache, s); +} + void kmem_cache_destroy(struct kmem_cache *s) { get_online_cpus(); @@ -352,8 +358,11 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif goto out_put_cpus; out_unlock: @@ -210,14 +210,11 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } - static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif @@ -3238,24 +3235,7 @@ static inline int kmem_cache_close(struct kmem_cache *s) int __kmem_cache_shutdown(struct kmem_cache *s) { - int rc = kmem_cache_close(s); - - if (!rc) { - /* - * Since slab_attr_store may take the slab_mutex, we should - * release the lock while removing the sysfs entry in order to - * avoid a deadlock. Because this is pretty much the last - * operation we do and the lock will be released shortly after - * that in slab_common.c, we could just move sysfs_slab_remove - * to a later point in common code. We should do that when we - * have a common sysfs framework for all allocators. - */ - mutex_unlock(&slab_mutex); - sysfs_slab_remove(s); - mutex_lock(&slab_mutex); - } - - return rc; + return kmem_cache_close(s); } /******************************************************************** @@ -5071,15 +5051,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #ifdef CONFIG_MEMCG_KMEM int i; char *buffer = NULL; + struct kmem_cache *root_cache; - if (!is_root_cache(s)) + if (is_root_cache(s)) return; + root_cache = s->memcg_params->root_cache; + /* * This mean this cache had no attribute written. Therefore, no point * in copying default values around */ - if (!s->max_attr_size) + if (!root_cache->max_attr_size) return; for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { @@ -5101,7 +5084,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5110,7 +5093,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(s->memcg_params->root_cache, buf); + attr->show(root_cache, buf); attr->store(s, buf, strlen(buf)); } @@ -5119,6 +5102,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #endif } +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5126,6 +5114,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5252,7 +5241,7 @@ out_put_kobj: goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* diff --git a/mm/truncate.c b/mm/truncate.c index e5cc39a..6a78c81 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -484,14 +484,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - /* - * Note: this function may get called on a shmem/tmpfs mapping: - * pagevec_lookup() might then return 0 prematurely (because it - * got a gangful of swap entries); but it's hardly worth worrying - * about - it can rarely have anything to free from such a mapping - * (most pages are dirty), and already skips over any difficulties. - */ - pagevec_init(&pvec, 0); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, @@ -10,6 +10,7 @@ #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> @@ -387,6 +388,15 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + struct address_space *page_mapping(struct page *page) { struct address_space *mapping = page->mapping; diff --git a/mm/vmacache.c b/mm/vmacache.c index d4224b3..1037a3ba 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -81,10 +81,12 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; - if (vma && vma->vm_start <= addr && vma->vm_end > addr) { - BUG_ON(vma->vm_mm != mm); + if (!vma) + continue; + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; + if (vma->vm_start <= addr && vma->vm_end > addr) return vma; - } } return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9b6497e..32c661d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1158,7 +1158,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } @@ -1916,6 +1916,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, get_lru_size(lruvec, LRU_INACTIVE_FILE); /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (global_reclaim(sc)) { + unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + + if (unlikely(file + free <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; + goto out; + } + } + + /* * There is enough inactive page cache, do not reclaim * anything from the anonymous working set right now. */ |