diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-08 17:52:23 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-08 17:52:23 -0700 |
commit | f6f7a6369203fa3e07efb7f35cfd81efe9f25b07 (patch) | |
tree | 97bec9ddd999040822acf314647eaf4208213589 /mm | |
parent | 839fe9156fbe89c3157aa6146d22090f8cffddd8 (diff) | |
parent | df69f52d990bd85159727bd26e819d3a6e49c666 (diff) | |
download | op-kernel-dev-f6f7a6369203fa3e07efb7f35cfd81efe9f25b07.zip op-kernel-dev-f6f7a6369203fa3e07efb7f35cfd81efe9f25b07.tar.gz |
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
"Almost all of the rest of MM. There was an unusually large amount of
MM material this time"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits)
zpool: remove no-op module init/exit
mm: zbud: constify the zbud_ops
mm: zpool: constify the zpool_ops
mm: swap: zswap: maybe_preload & refactoring
zram: unify error reporting
zsmalloc: remove null check from destroy_handle_cache()
zsmalloc: do not take class lock in zs_shrinker_count()
zsmalloc: use class->pages_per_zspage
zsmalloc: consider ZS_ALMOST_FULL as migrate source
zsmalloc: partial page ordering within a fullness_list
zsmalloc: use shrinker to trigger auto-compaction
zsmalloc: account the number of compacted pages
zsmalloc/zram: introduce zs_pool_stats api
zsmalloc: cosmetic compaction code adjustments
zsmalloc: introduce zs_can_compact() function
zsmalloc: always keep per-class stats
zsmalloc: drop unused variable `nr_to_migrate'
mm/memblock.c: fix comment in __next_mem_range()
mm/page_alloc.c: fix type information of memoryless node
memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node()
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/bootmem.c | 7 | ||||
-rw-r--r-- | mm/compaction.c | 175 | ||||
-rw-r--r-- | mm/dmapool.c | 12 | ||||
-rw-r--r-- | mm/early_ioremap.c | 22 | ||||
-rw-r--r-- | mm/filemap.c | 36 | ||||
-rw-r--r-- | mm/huge_memory.c | 163 | ||||
-rw-r--r-- | mm/hugetlb.c | 432 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 1 | ||||
-rw-r--r-- | mm/kmemleak.c | 3 | ||||
-rw-r--r-- | mm/list_lru.c | 4 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 31 | ||||
-rw-r--r-- | mm/memcontrol.c | 394 | ||||
-rw-r--r-- | mm/memory-failure.c | 103 | ||||
-rw-r--r-- | mm/memory.c | 48 | ||||
-rw-r--r-- | mm/mempolicy.c | 7 | ||||
-rw-r--r-- | mm/mempool.c | 3 | ||||
-rw-r--r-- | mm/memtest.c | 27 | ||||
-rw-r--r-- | mm/migrate.c | 13 | ||||
-rw-r--r-- | mm/mmap.c | 71 | ||||
-rw-r--r-- | mm/oom_kill.c | 142 | ||||
-rw-r--r-- | mm/page_alloc.c | 80 | ||||
-rw-r--r-- | mm/page_isolation.c | 35 | ||||
-rw-r--r-- | mm/shmem.c | 16 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 5 | ||||
-rw-r--r-- | mm/slob.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/swap_state.c | 37 | ||||
-rw-r--r-- | mm/swapfile.c | 42 | ||||
-rw-r--r-- | mm/vmscan.c | 14 | ||||
-rw-r--r-- | mm/zbud.c | 10 | ||||
-rw-r--r-- | mm/zpool.c | 18 | ||||
-rw-r--r-- | mm/zsmalloc.c | 235 | ||||
-rw-r--r-- | mm/zswap.c | 75 |
36 files changed, 1243 insertions, 1030 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c index a23dd19..3b63807 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) count += pages; while (pages--) __free_pages_bootmem(page++, cur++, 0); + bdata->node_bootmem_map = NULL; bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); @@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata, sidx + bdata->node_min_pfn, eidx + bdata->node_min_pfn); + if (WARN_ON(bdata->node_bootmem_map == NULL)) + return; + if (bdata->hint_idx > sidx) bdata->hint_idx = sidx; @@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, eidx + bdata->node_min_pfn, flags); + if (WARN_ON(bdata->node_bootmem_map == NULL)) + return 0; + for (idx = sidx; idx < eidx; idx++) if (test_and_set_bit(idx, bdata->node_bootmem_map)) { if (exclusive) { diff --git a/mm/compaction.c b/mm/compaction.c index 018f08d..c5c627a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc, return !get_pageblock_skip(page); } +static void reset_cached_positions(struct zone *zone) +{ + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn[0] = start_pfn; - zone->compact_cached_migrate_pfn[1] = start_pfn; - zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone) clear_pageblock_skip(page); } + + reset_cached_positions(zone); } void reset_isolation_suitable(pg_data_t *pgdat) @@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (!valid_page) valid_page = page; + + /* + * For compound pages such as THP and hugetlbfs, we can save + * potentially a lot of iterations if we skip them at once. + * The check is racy, but we can consider only valid values + * and the only danger is skipping too much. + */ + if (PageCompound(page)) { + unsigned int comp_order = compound_order(page); + + if (likely(comp_order < MAX_ORDER)) { + blockpfn += (1UL << comp_order) - 1; + cursor += (1UL << comp_order) - 1; + } + + goto isolate_fail; + } + if (!PageBuddy(page)) goto isolate_fail; @@ -490,6 +514,13 @@ isolate_fail: } + /* + * There is a tiny chance that we have read bogus compound_order(), + * so be careful to not go outside of the pageblock. + */ + if (unlikely(blockpfn > end_pfn)) + blockpfn = end_pfn; + trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, nr_scanned, total_isolated); @@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { + bool is_lru; + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort async compaction @@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * It's possible to migrate LRU pages and balloon pages * Skip any other type of page */ - if (!PageLRU(page)) { + is_lru = PageLRU(page); + if (!is_lru) { if (unlikely(balloon_page_movable(page))) { if (balloon_page_isolate(page)) { /* Successfully isolated */ goto isolate_success; } } - continue; } /* - * PageLRU is set. lru_lock normally excludes isolation - * splitting and collapsing (collapsing has already happened - * if PageLRU is set) but the lock is not necessarily taken - * here and it is wasteful to take it just to check transhuge. - * Check TransHuge without lock and skip the whole pageblock if - * it's either a transhuge or hugetlbfs page, as calling - * compound_order() without preventing THP from splitting the - * page underneath us may return surprising results. + * Regardless of being on LRU, compound pages such as THP and + * hugetlbfs are not to be compacted. We can potentially save + * a lot of iterations if we skip them at once. The check is + * racy, but we can consider only valid values and the only + * danger is skipping too much. */ - if (PageTransHuge(page)) { - if (!locked) - low_pfn = ALIGN(low_pfn + 1, - pageblock_nr_pages) - 1; - else - low_pfn += (1 << compound_order(page)) - 1; + if (PageCompound(page)) { + unsigned int comp_order = compound_order(page); + + if (likely(comp_order < MAX_ORDER)) + low_pfn += (1UL << comp_order) - 1; continue; } + if (!is_lru) + continue; + /* * Migration will fail if an anonymous page is pinned in memory, * so avoid taking lru_lock and isolating it unnecessarily in an @@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!locked) break; - /* Recheck PageLRU and PageTransHuge under lock */ + /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) continue; - if (PageTransHuge(page)) { - low_pfn += (1 << compound_order(page)) - 1; + + /* + * Page become compound since the non-locked check, + * and it's on LRU. It can only be a THP so the order + * is safe to read and it's 0 for tail pages. + */ + if (unlikely(PageCompound(page))) { + low_pfn += (1UL << compound_order(page)) - 1; continue; } } @@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (__isolate_lru_page(page, isolate_mode) != 0) continue; - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_PAGE(PageCompound(page), page); /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); @@ -898,6 +936,16 @@ static bool suitable_migration_target(struct page *page) } /* + * Test whether the free scanner has reached the same or lower pageblock than + * the migration scanner, and compaction should thus terminate. + */ +static inline bool compact_scanners_met(struct compact_control *cc) +{ + return (cc->free_pfn >> pageblock_order) + <= (cc->migrate_pfn >> pageblock_order); +} + +/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc) * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; block_start_pfn >= low_pfn && - cc->nr_migratepages > cc->nr_freepages; + for (; block_start_pfn >= low_pfn; block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { @@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn, freelist, false); /* + * If we isolated enough freepages, or aborted due to async + * compaction being contended, terminate the loop. * Remember where the free scanner should restart next time, * which is where isolate_freepages_block() left off. * But if it scanned the whole pageblock, isolate_start_pfn @@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc) * In that case we will however want to restart at the start * of the previous pageblock. */ - cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? - isolate_start_pfn : - block_start_pfn - pageblock_nr_pages; - - /* - * isolate_freepages_block() might have aborted due to async - * compaction being contended - */ - if (cc->contended) + if ((cc->nr_freepages >= cc->nr_migratepages) + || cc->contended) { + if (isolate_start_pfn >= block_end_pfn) + isolate_start_pfn = + block_start_pfn - pageblock_nr_pages; break; + } else { + /* + * isolate_freepages_block() should not terminate + * prematurely unless contended, or isolated enough + */ + VM_BUG_ON(isolate_start_pfn < block_end_pfn); + } } /* split_free_page does not map the pages */ map_pages(freelist); /* - * If we crossed the migrate scanner, we want to keep it that way - * so that compact_finished() may detect this + * Record where the free scanner will restart next time. Either we + * broke from the loop and set isolate_start_pfn based on the last + * call to isolate_freepages_block(), or we met the migration scanner + * and the loop terminated due to isolate_start_pfn < low_pfn */ - if (block_start_pfn < low_pfn) - cc->free_pfn = cc->migrate_pfn; + cc->free_pfn = isolate_start_pfn; } /* @@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | @@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ + isolate_start_pfn = low_pfn; low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, isolate_mode); @@ -1119,6 +1174,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that could have been isolated and + * then freed by migration. + */ + if (cc->nr_migratepages && !cc->last_migrated_pfn) + cc->last_migrated_pfn = isolate_start_pfn; + + /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. @@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } acct_isolated(zone, cc); - /* - * Record where migration scanner will be restarted. If we end up in - * the same pageblock as the free scanner, make the scanners fully - * meet so that compact_finished() terminates compaction. - */ - cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; + /* Record where migration scanner will be restarted. */ + cc->migrate_pfn = low_pfn; return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } @@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) { + if (compact_scanners_met(cc)) { /* Let the next compaction start anew. */ - zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; - zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + reset_cached_positions(zone); /* * Mark that the PG_migrate_skip information should be cleared @@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; - unsigned long last_migrated_pfn = 0; ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); @@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); @@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc, migratetype)) == COMPACT_CONTINUE) { int err; - unsigned long isolate_start_pfn = cc->migrate_pfn; switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: @@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it */ - if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { + if (err == -ENOMEM && !compact_scanners_met(cc)) { ret = COMPACT_PARTIAL; goto out; } } - /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. We use the pfn that - * isolate_migratepages() started from in this loop iteration - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (!last_migrated_pfn) - last_migrated_pfn = isolate_start_pfn; - check_drain: /* * Has the migration scanner moved away from the previous @@ -1400,18 +1447,18 @@ check_drain: * compact_finished() can detect immediately if allocation * would succeed. */ - if (cc->order > 0 && last_migrated_pfn) { + if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = cc->migrate_pfn & ~((1UL << cc->order) - 1); - if (last_migrated_pfn < current_block_start) { + if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu(); lru_add_drain_cpu(cpu); drain_local_pages(zone); put_cpu(); /* No more flushing until we migrate again */ - last_migrated_pfn = 0; + cc->last_migrated_pfn = 0; } } diff --git a/mm/dmapool.c b/mm/dmapool.c index 59d10d1..71a8998 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool) { bool empty = false; + if (unlikely(!pool)) + return; + mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); @@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ spin_unlock_irqrestore(&pool->lock, flags); - page = pool_alloc_page(pool, mem_flags); + page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO)); if (!page) return NULL; @@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, break; } } - memset(retval, POOL_POISON_ALLOCATED, pool->size); + if (!(mem_flags & __GFP_ZERO)) + memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif spin_unlock_irqrestore(&pool->lock, flags); + + if (mem_flags & __GFP_ZERO) + memset(retval, 0, pool->size); + return retval; } EXPORT_SYMBOL(dma_pool_alloc); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 0cfadaf..23f744d 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size) return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); } #endif + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) + +void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) +{ + unsigned long slop, clen; + char *p; + + while (size) { + slop = src & ~PAGE_MASK; + clen = size; + if (clen > MAX_MAP_CHUNK - slop) + clen = MAX_MAP_CHUNK - slop; + p = early_memremap(src & PAGE_MASK, clen + slop); + memcpy(dest, p + slop, clen); + early_memunmap(p, clen + slop); + dest += clen; + src += clen; + size -= clen; + } +} + #else /* CONFIG_MMU */ void __init __iomem * diff --git a/mm/filemap.c b/mm/filemap.c index 1283fc8..72940fb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp) do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); + page = __alloc_pages_node(n, gfp, 0); } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; @@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file, iov_iter_count(i)); again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { - status = -EFAULT; - break; - } - status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); if (unlikely(status < 0)) @@ -2495,8 +2480,17 @@ again: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - + /* + * 'page' is now locked. If we are trying to copy from a + * mapping of 'page' in userspace, the copy might fault and + * would need PageUptodate() to complete. But, page can not be + * made Uptodate without acquiring the page lock, which we hold. + * Deadlock. Avoid with pagefault_disable(). Fix up below with + * iov_iter_fault_in_readable(). + */ + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); flush_dcache_page(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, @@ -2519,6 +2513,14 @@ again: */ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_single_seg_count(i)); + /* + * This is the fallback to recover if the copy from + * userspace above faults. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } goto again; } pos += copied; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 279a818..b16279c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,6 +16,7 @@ #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> +#include <linux/dax.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> @@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = { }; -static int set_recommended_min_free_kbytes(void) +static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; @@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } setup_per_zone_wmarks(); - return 0; } static int start_stop_khugepaged(void) @@ -172,12 +172,7 @@ fail: static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -static inline bool is_huge_zero_pmd(pmd_t pmd) -{ - return is_huge_zero_page(pmd_page(pmd)); -} - -static struct page *get_huge_zero_page(void) +struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) } /* Caller must hold page table lock. */ -static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; + if (!pmd_none(*pmd)) + return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); + return true; } int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, flags); } +static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t entry; + spinlock_t *ptl; + + ptl = pmd_lock(mm, pmd); + if (pmd_none(*pmd)) { + entry = pmd_mkhuge(pfn_pmd(pfn, prot)); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); + } + spin_unlock(ptl); +} + +int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned long pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pmd_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + if (track_pfn_insert(vma, &pgprot, pfn)) + return VM_FAULT_SIGBUS; + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); + return VM_FAULT_NOPAGE; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -1414,41 +1455,41 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + pmd_t orig_pmd; spinlock_t *ptl; - int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - struct page *page; - pgtable_t pgtable; - pmd_t orig_pmd; - /* - * For architectures like ppc64 we look at deposited pgtable - * when calling pmdp_huge_get_and_clear. So do the - * pgtable_trans_huge_withdraw after finishing pmdp related - * operations. - */ - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); - tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); - if (is_huge_zero_pmd(orig_pmd)) { - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); + if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + if (is_huge_zero_pmd(orig_pmd)) put_huge_zero_page(); - } else { - page = pmd_page(orig_pmd); - page_remove_rmap(page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON_PAGE(!PageHead(page), page); - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); - tlb_remove_page(tlb, page); - } - pte_free(tlb->mm, pgtable); - ret = 1; + } else if (is_huge_zero_pmd(orig_pmd)) { + pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + put_huge_zero_page(); + } else { + struct page *page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON_PAGE(!PageHead(page), page); + pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + tlb_remove_page(tlb, page); } - return ret; + return 1; } int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, @@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void khugepaged_alloc_sleep(void) { - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + DEFINE_WAIT(wait); + + add_wait_queue(&khugepaged_wait, &wait); + freezable_schedule_timeout_interruptible( + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); } static int khugepaged_node_load[MAX_NUMNODES]; @@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); + *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - return; - } - if (is_huge_zero_pmd(*pmd)) { + if (unlikely(!pmd_trans_huge(*pmd))) + goto unlock; + if (vma_is_dax(vma)) { + pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + if (is_huge_zero_pmd(_pmd)) + put_huge_zero_page(); + } else if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - return; + } else { + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + get_page(page); } - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!page_count(page), page); - get_page(page); + unlock: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - split_huge_page(page); + if (!page) + return; + split_huge_page(page); put_page(page); /* @@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm, split_huge_page_pmd_mm(mm, address, pmd); } -void __vma_adjust_trans_huge(struct vm_area_struct *vma, +void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51ae41d..999fb0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock); * prevent spurious OOMs when the hugepage pool is fully utilized. */ static int num_fault_mutexes; -static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; +struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -240,11 +240,14 @@ struct file_region { /* * Add the huge page range represented by [f, t) to the reserve - * map. Existing regions will be expanded to accommodate the - * specified range. We know only existing regions need to be - * expanded, because region_add is only called after region_chg - * with the same range. If a new file_region structure must - * be allocated, it is done in region_chg. + * map. In the normal case, existing regions will be expanded + * to accommodate the specified range. Sufficient regions should + * exist for expansion due to the previous call to region_chg + * with the same range. However, it is possible that region_del + * could have been called after region_chg and modifed the map + * in such a way that no region exists to be expanded. In this + * case, pull a region descriptor from the cache associated with + * the map and use that for the new range. * * Return the number of new huge pages added to the map. This * number is greater than or equal to zero. @@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t) if (f <= rg->to) break; + /* + * If no region exists which can be expanded to include the + * specified range, the list must have been modified by an + * interleving call to region_del(). Pull a region descriptor + * from the cache and use it for this range. + */ + if (&rg->link == head || t < rg->from) { + VM_BUG_ON(resv->region_cache_count <= 0); + + resv->region_cache_count--; + nrg = list_first_entry(&resv->region_cache, struct file_region, + link); + list_del(&nrg->link); + + nrg->from = f; + nrg->to = t; + list_add(&nrg->link, rg->link.prev); + + add += t - f; + goto out_locked; + } + /* Round our left edge to the current segment if it encloses us. */ if (f > rg->from) f = rg->from; @@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t) add += t - nrg->to; /* Added to end of region */ nrg->to = t; +out_locked: + resv->adds_in_progress--; spin_unlock(&resv->lock); VM_BUG_ON(add < 0); return add; @@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t) * so that the subsequent region_add call will have all the * regions it needs and will not fail. * - * Returns the number of huge pages that need to be added - * to the existing reservation map for the range [f, t). - * This number is greater or equal to zero. -ENOMEM is - * returned if a new file_region structure is needed and can - * not be allocated. + * Upon entry, region_chg will also examine the cache of region descriptors + * associated with the map. If there are not enough descriptors cached, one + * will be allocated for the in progress add operation. + * + * Returns the number of huge pages that need to be added to the existing + * reservation map for the range [f, t). This number is greater or equal to + * zero. -ENOMEM is returned if a new file_region structure or cache entry + * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t) { @@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); +retry_locked: + resv->adds_in_progress++; + + /* + * Check for sufficient descriptors in the cache to accommodate + * the number of in progress add operations. + */ + if (resv->adds_in_progress > resv->region_cache_count) { + struct file_region *trg; + + VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); + /* Must drop lock to allocate a new descriptor. */ + resv->adds_in_progress--; + spin_unlock(&resv->lock); + + trg = kmalloc(sizeof(*trg), GFP_KERNEL); + if (!trg) + return -ENOMEM; + + spin_lock(&resv->lock); + list_add(&trg->link, &resv->region_cache); + resv->region_cache_count++; + goto retry_locked; + } + /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -336,6 +391,7 @@ retry: * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { if (!nrg) { + resv->adds_in_progress--; spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) @@ -385,43 +441,131 @@ out_nrg: } /* - * Truncate the reserve map at index 'end'. Modify/truncate any - * region which contains end. Delete any regions past end. - * Return the number of huge pages removed from the map. + * Abort the in progress add operation. The adds_in_progress field + * of the resv_map keeps track of the operations in progress between + * calls to region_chg and region_add. Operations are sometimes + * aborted after the call to region_chg. In such cases, region_abort + * is called to decrement the adds_in_progress counter. + * + * NOTE: The range arguments [f, t) are not needed or used in this + * routine. They are kept to make reading the calling code easier as + * arguments will match the associated region_chg call. */ -static long region_truncate(struct resv_map *resv, long end) +static void region_abort(struct resv_map *resv, long f, long t) +{ + spin_lock(&resv->lock); + VM_BUG_ON(!resv->region_cache_count); + resv->adds_in_progress--; + spin_unlock(&resv->lock); +} + +/* + * Delete the specified range [f, t) from the reserve map. If the + * t parameter is LONG_MAX, this indicates that ALL regions after f + * should be deleted. Locate the regions which intersect [f, t) + * and either trim, delete or split the existing regions. + * + * Returns the number of huge pages deleted from the reserve map. + * In the normal case, the return value is zero or more. In the + * case where a region must be split, a new region descriptor must + * be allocated. If the allocation fails, -ENOMEM will be returned. + * NOTE: If the parameter t == LONG_MAX, then we will never split + * a region and possibly return -ENOMEM. Callers specifying + * t == LONG_MAX do not need to check for -ENOMEM error. + */ +static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; - long chg = 0; + struct file_region *nrg = NULL; + long del = 0; +retry: spin_lock(&resv->lock); - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (end <= rg->to) + list_for_each_entry_safe(rg, trg, head, link) { + if (rg->to <= f) + continue; + if (rg->from >= t) break; - if (&rg->link == head) - goto out; - /* If we are in the middle of a region then adjust it. */ - if (end > rg->from) { - chg = rg->to - end; - rg->to = end; - rg = list_entry(rg->link.next, typeof(*rg), link); - } + if (f > rg->from && t < rg->to) { /* Must split region */ + /* + * Check for an entry in the cache before dropping + * lock and attempting allocation. + */ + if (!nrg && + resv->region_cache_count > resv->adds_in_progress) { + nrg = list_first_entry(&resv->region_cache, + struct file_region, + link); + list_del(&nrg->link); + resv->region_cache_count--; + } - /* Drop any remaining regions. */ - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + goto retry; + } + + del += t - f; + + /* New entry for end of split region */ + nrg->from = t; + nrg->to = rg->to; + INIT_LIST_HEAD(&nrg->link); + + /* Original entry is trimmed */ + rg->to = f; + + list_add(&nrg->link, &rg->link); + nrg = NULL; break; - chg += rg->to - rg->from; - list_del(&rg->link); - kfree(rg); + } + + if (f <= rg->from && t >= rg->to) { /* Remove entire region */ + del += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + continue; + } + + if (f <= rg->from) { /* Trim beginning of region */ + del += t - rg->from; + rg->from = t; + } else { /* Trim end of region */ + del += rg->to - f; + rg->to = f; + } } -out: spin_unlock(&resv->lock); - return chg; + kfree(nrg); + return del; +} + +/* + * A rare out of memory error was encountered which prevented removal of + * the reserve map region for a page. The huge page itself was free'ed + * and removed from the page cache. This routine will adjust the subpool + * usage count, and the global reserve count if needed. By incrementing + * these counts, the reserve map entry which could not be deleted will + * appear as a "reserved" entry instead of simply dangling with incorrect + * counts. + */ +void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +{ + struct hugepage_subpool *spool = subpool_inode(inode); + long rsv_adjust; + + rsv_adjust = hugepage_subpool_get_pages(spool, 1); + if (restore_reserve && rsv_adjust) { + struct hstate *h = hstate_inode(inode); + + hugetlb_acct_memory(h, 1); + } } /* @@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma, struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); - if (!resv_map) + struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); + + if (!resv_map || !rg) { + kfree(resv_map); + kfree(rg); return NULL; + } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); + resv_map->adds_in_progress = 0; + + INIT_LIST_HEAD(&resv_map->region_cache); + list_add(&rg->link, &resv_map->region_cache); + resv_map->region_cache_count = 1; + return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + struct list_head *head = &resv_map->region_cache; + struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ - region_truncate(resv_map, 0); + region_del(resv_map, 0, LONG_MAX); + + /* ... and any entries left in the cache */ + list_for_each_entry_safe(rg, trg, head, link) { + list_del(&rg->link); + kfree(rg); + } + + VM_BUG_ON(resv_map->adds_in_progress); + kfree(resv_map); } @@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) } /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) - return true; + if (vma->vm_flags & VM_MAYSHARE) { + /* + * We know VM_NORESERVE is not set. Therefore, there SHOULD + * be a region map for all pages. The only situation where + * there is no region map is if a hole was punched via + * fallocate. In this case, there really are no reverves to + * use. This situation is indicated if chg != 0. + */ + if (chg) + return false; + else + return true; + } /* * Only the process that called mmap() has reserves for @@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - page = alloc_pages_exact_node(nid, + page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else - page = alloc_pages_exact_node(nid, + page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h, } } + /* - * vma_needs_reservation and vma_commit_reservation are used by the huge - * page allocation routines to manage reservations. + * vma_needs_reservation, vma_commit_reservation and vma_end_reservation + * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called - * to add the page to the reservation map. + * to add the page to the reservation map. If the page allocation fails, + * the reservation must be ended instead of committed. vma_end_reservation + * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. The only time this @@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h, * is the responsibility of the caller to notice the difference and * take appropriate action. */ +enum vma_resv_mode { + VMA_NEEDS_RESV, + VMA_COMMIT_RESV, + VMA_END_RESV, +}; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, - bool commit) + enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; @@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h, return 1; idx = vma_hugecache_offset(h, vma, addr); - if (commit) - ret = region_add(resv, idx, idx + 1); - else + switch (mode) { + case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1); + break; + case VMA_COMMIT_RESV: + ret = region_add(resv, idx, idx + 1); + break; + case VMA_END_RESV: + region_abort(resv, idx, idx + 1); + ret = 0; + break; + default: + BUG(); + } if (vma->vm_flags & VM_MAYSHARE) return ret; @@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __vma_reservation_common(h, vma, addr, false); + return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __vma_reservation_common(h, vma, addr, true); + return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); +} + +static void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } -static struct page *alloc_huge_page(struct vm_area_struct *vma, +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - long chg, commit; + long map_chg, map_commit; + long gbl_chg; int ret, idx; struct hugetlb_cgroup *h_cg; idx = hstate_index(h); /* - * Processes that did not create the mapping will have no - * reserves and will not have accounted against subpool - * limit. Check that the subpool limit can be made before - * satisfying the allocation MAP_NORESERVE mappings may also - * need pages and subpool limit allocated allocated if no reserve - * mapping overlaps. + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). */ - chg = vma_needs_reservation(h, vma, addr); - if (chg < 0) + map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); + if (map_chg < 0) return ERR_PTR(-ENOMEM); - if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1) < 0) + + /* + * Processes that did not create the mapping will have no + * reserves as indicated by the region/reserve map. Check + * that the allocation will not exceed the subpool limit. + * Allocations for MAP_NORESERVE mappings also need to be + * checked against any subpool limit. + */ + if (map_chg || avoid_reserve) { + gbl_chg = hugepage_subpool_get_pages(spool, 1); + if (gbl_chg < 0) { + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); + } + + /* + * Even though there was no reservation in the region/reserve + * map, there could be reservations associated with the + * subpool that can be used. This would be indicated if the + * return value of hugepage_subpool_get_pages() is zero. + * However, if avoid_reserve is specified we still avoid even + * the subpool reservations. + */ + if (avoid_reserve) + gbl_chg = 1; + } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); + /* + * glb_chg is passed to indicate whether or not a page must be taken + * from the global free pool (global change). gbl_chg == 0 indicates + * a reservation exists for the allocation. + */ + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); @@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); - commit = vma_commit_reservation(h, vma, addr); - if (unlikely(chg > commit)) { + map_commit = vma_commit_reservation(h, vma, addr); + if (unlikely(map_chg > map_commit)) { /* * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. @@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (chg || avoid_reserve) + if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); - kfree(htlb_fault_mutex_table); + kfree(hugetlb_fault_mutex_table); } module_exit(hugetlb_exit); @@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void) #else num_fault_mutexes = 1; #endif - htlb_fault_mutex_table = + hugetlb_fault_mutex_table = kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); - BUG_ON(!htlb_fault_mutex_table); + BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) - mutex_init(&htlb_fault_mutex_table[i]); + mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } +int huge_add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t idx) +{ + struct inode *inode = mapping->host; + struct hstate *h = hstate_inode(inode); + int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + + if (err) + return err; + ClearPagePrivate(page); + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + return 0; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, unsigned int flags) @@ -3194,21 +3439,13 @@ retry: set_page_huge_active(page); if (vma->vm_flags & VM_MAYSHARE) { - int err; - struct inode *inode = mapping->host; - - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + int err = huge_add_to_page_cache(page, mapping, idx); if (err) { put_page(page); if (err == -EEXIST) goto retry; goto out; } - ClearPagePrivate(page); - - spin_lock(&inode->i_lock); - inode->i_blocks += blocks_per_huge_page(h); - spin_unlock(&inode->i_lock); } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { @@ -3236,11 +3473,14 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } + /* Just decrements count, does not deallocate */ + vma_end_reservation(h, vma, address); + } ptl = huge_pte_lockptr(h, mm, ptep); spin_lock(ptl); @@ -3280,7 +3520,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address) @@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address) @@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); - mutex_lock(&htlb_fault_mutex_table[hash]); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { @@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out_mutex; } + /* Just decrements count, does not deallocate */ + vma_end_reservation(h, vma, address); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -3437,7 +3679,7 @@ out_ptl: put_page(pagecache_page); } out_mutex: - mutex_unlock(&htlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode, } return 0; out_err: + if (!vma || vma->vm_flags & VM_MAYSHARE) + region_abort(resv_map, from, to); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; } -void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +long hugetlb_unreserve_pages(struct inode *inode, long start, long end, + long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); @@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; - if (resv_map) - chg = region_truncate(resv_map, offset); + if (resv_map) { + chg = region_del(resv_map, start, end); + /* + * region_del() can fail in the rare case where a region + * must be split and another region descriptor can not be + * allocated. If end == LONG_MAX, it will not fail. + */ + if (chg < 0) + return chg; + } + spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); @@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); + + return 0; } #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index bf73ac1..aeba0ed 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -58,7 +58,7 @@ inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); put_out: - put_page(p); + put_hwpoison_page(p); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1195dd2..bc0fa9a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -182,6 +182,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ int order; /* order a direct compactor needs */ diff --git a/mm/kmemleak.c b/mm/kmemleak.c index cf79f11..f532f6a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, } if (crt_early_log >= ARRAY_SIZE(early_log)) { + crt_early_log++; kmemleak_disable(); return; } @@ -1882,7 +1883,7 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - if (crt_early_log >= ARRAY_SIZE(early_log)) + if (crt_early_log > ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); diff --git a/mm/list_lru.c b/mm/list_lru.c index 909eca2..e1da19f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) struct list_lru_one *l; spin_lock(&nlru->lock); - l = list_lru_from_kmem(nlru, item); if (list_empty(item)) { + l = list_lru_from_kmem(nlru, item); list_add_tail(item, &l->list); l->nr_items++; spin_unlock(&nlru->lock); @@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) struct list_lru_one *l; spin_lock(&nlru->lock); - l = list_lru_from_kmem(nlru, item); if (!list_empty(item)) { + l = list_lru_from_kmem(nlru, item); list_del_init(item); l->nr_items--; spin_unlock(&nlru->lock); diff --git a/mm/madvise.c b/mm/madvise.c index ce3a422..c889fcb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) + if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; diff --git a/mm/memblock.c b/mm/memblock.c index 95ce68c..1c7b647 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } -static long __init_memblock memblock_overlaps_region(struct memblock_type *type, +bool __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long i; @@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, break; } - return (i < type->cnt) ? i : -1; + return i < type->cnt; } /* @@ -569,6 +569,7 @@ repeat: #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP WARN_ON(nid != memblock_get_region_node(rgn)); #endif + WARN_ON(flags != rgn->flags); nr_new++; if (insert) memblock_insert_region(type, i++, base, @@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *_rgn = &memblock.memory; + struct memblock_type *type = &memblock.memory; memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(_rgn, base, size, nid, flags); + return memblock_add_range(type, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) * * This function isolates region [@base, @base + @size), and sets/clears flag * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. */ static int __init_memblock memblock_setclr_flag(phys_addr_t base, phys_addr_t size, int set, int flag) @@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { @@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { @@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. */ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) { @@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, phys_addr_t *out_end) { - struct memblock_type *rsv = &memblock.reserved; + struct memblock_type *type = &memblock.reserved; - if (*idx >= 0 && *idx < rsv->cnt) { - struct memblock_region *r = &rsv->regions[*idx]; + if (*idx >= 0 && *idx < type->cnt) { + struct memblock_region *r = &type->regions[*idx]; phys_addr_t base = r->base; phys_addr_t size = r->size; @@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, * in type_b. * * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken @@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size * Check if the region [@base, @base+@size) intersects a reserved memory block. * * RETURNS: - * 0 if false, non-zero if true + * True if they intersect, false if not. */ -int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); - return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; + return memblock_overlaps_region(&memblock.reserved, base, size); } void __init_memblock memblock_trim_memory(phys_addr_t align) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1af0575..1742a2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = { "unevictable", }; -/* - * Per memcg event counter is incremented at every pagein/pageout. With THP, - * it will be incremated by the number of pages. This counter is used for - * for trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - */ -enum mem_cgroup_events_target { - MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_TARGET_NUMAINFO, - MEM_CGROUP_NTARGETS, -}; #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 #define NUMAINFO_EVENTS_TARGET 1024 -struct mem_cgroup_stat_cpu { - long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEMCG_NR_EVENTS]; - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct reclaim_iter { - struct mem_cgroup *position; - /* scan generation, increased every round-trip */ - unsigned int generation; -}; - -/* - * per-zone information in memory controller. - */ -struct mem_cgroup_per_zone { - struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; - - struct reclaim_iter iter[DEF_PRIORITY + 1]; - - struct rb_node tree_node; /* RB tree node */ - unsigned long usage_in_excess;/* Set to the value by which */ - /* the soft limit is exceeded*/ - bool on_tree; - struct mem_cgroup *memcg; /* Back pointer, we cannot */ - /* use container_of */ -}; - -struct mem_cgroup_per_node { - struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -181,32 +135,6 @@ struct mem_cgroup_tree { static struct mem_cgroup_tree soft_limit_tree __read_mostly; -struct mem_cgroup_threshold { - struct eventfd_ctx *eventfd; - unsigned long threshold; -}; - -/* For threshold */ -struct mem_cgroup_threshold_ary { - /* An array index points to threshold just below or equal to usage. */ - int current_threshold; - /* Size of entries[] */ - unsigned int size; - /* Array of thresholds */ - struct mem_cgroup_threshold entries[0]; -}; - -struct mem_cgroup_thresholds { - /* Primary thresholds array */ - struct mem_cgroup_threshold_ary *primary; - /* - * Spare threshold array. - * This is needed to make mem_cgroup_unregister_event() "never fail". - * It must be able to store at least primary->size - 1 entries. - */ - struct mem_cgroup_threshold_ary *spare; -}; - /* for OOM */ struct mem_cgroup_eventfd_list { struct list_head list; @@ -256,113 +184,6 @@ struct mem_cgroup_event { static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); -/* - * The memory controller data structure. The memory controller controls both - * page cache and RSS per cgroup. We would eventually like to provide - * statistics based on the statistics developed by Rik Van Riel for clock-pro, - * to help the administrator determine what knobs to tune. - */ -struct mem_cgroup { - struct cgroup_subsys_state css; - - /* Accounted resources */ - struct page_counter memory; - struct page_counter memsw; - struct page_counter kmem; - - /* Normal memory consumption range */ - unsigned long low; - unsigned long high; - - unsigned long soft_limit; - - /* vmpressure notifications */ - struct vmpressure vmpressure; - - /* css_online() has been completed */ - int initialized; - - /* - * Should the accounting and control be hierarchical, per subtree? - */ - bool use_hierarchy; - - /* protected by memcg_oom_lock */ - bool oom_lock; - int under_oom; - - int swappiness; - /* OOM-Killer disable */ - int oom_kill_disable; - - /* protect arrays of thresholds */ - struct mutex thresholds_lock; - - /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_thresholds thresholds; - - /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_thresholds memsw_thresholds; - - /* For oom notifier event fd */ - struct list_head oom_notify; - - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - struct task_struct *move_lock_task; - unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; - spinlock_t pcp_counter_lock; - -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) - struct cg_proto tcp_mem; -#endif -#if defined(CONFIG_MEMCG_KMEM) - /* Index in the kmem_cache->memcg_params.memcg_caches array */ - int kmemcg_id; - bool kmem_acct_activated; - bool kmem_acct_active; -#endif - - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - -#ifdef CONFIG_CGROUP_WRITEBACK - struct list_head cgwb_list; - struct wb_domain cgwb_domain; -#endif - - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - - struct mem_cgroup_per_node *nodeinfo[0]; - /* WARNING: nodeinfo must be the last member here */ -}; - -#ifdef CONFIG_MEMCG_KMEM -bool memcg_kmem_is_active(struct mem_cgroup *memcg) -{ - return memcg->kmem_acct_active; -} -#endif - /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. @@ -423,11 +244,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) -{ - return s ? container_of(s, struct mem_cgroup, css) : NULL; -} - /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && + if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } @@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) return &memcg->nodeinfo[nid]->zoneinfo[zid]; } -struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) -{ - return &memcg->css; -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - struct mem_cgroup_per_zone *mz; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - return mz->lru_size[lru]; -} - static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) @@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } +EXPORT_SYMBOL(mem_cgroup_from_task); static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { @@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct reclaim_iter *uninitialized_var(iter); + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; struct mem_cgroup *pos = NULL; @@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - goto out; - - switch (idx) { - case PGFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); - break; - case PGMAJFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); - break; - default: - BUG(); - } -out: - rcu_read_unlock(); -} -EXPORT_SYMBOL(__mem_cgroup_count_vm_event); - /** * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg * @zone: zone of the wanted lruvec @@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, VM_BUG_ON((long)(*lru_size) < 0); } -bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) -{ - if (root == memcg) - return true; - if (!root->use_hierarchy) - return false; - return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); -} - bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; @@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; - unsigned long gb; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); - - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - - return inactive * inactive_ratio < active; -} - -bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -{ - struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return true; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - memcg = mz->memcg; - - return !!(memcg->css.flags & CSS_ONLINE); -} - #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) return margin; } -int mem_cgroup_swappiness(struct mem_cgroup *memcg) -{ - /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) - return vm_swappiness; - - return memcg->swappiness; -} - /* * A routine for checking "mem" is under move_account() or not. * @@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { + struct oom_control oc = { + .zonelist = NULL, + .nodemask = NULL, + .gfp_mask = gfp_mask, + .order = order, + }; struct mem_cgroup *iter; unsigned long chosen_points = 0; unsigned long totalpages; @@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, goto unlock; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(task, totalpages, NULL, - false)) { + switch (oom_scan_process_thread(&oc, task, totalpages)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); @@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (chosen) { points = chosen_points * 1000 / totalpages; - oom_kill_process(chosen, gfp_mask, order, points, totalpages, - memcg, NULL, "Memory cgroup out of memory"); + oom_kill_process(&oc, chosen, points, totalpages, memcg, + "Memory cgroup out of memory"); } unlock: mutex_unlock(&oom_lock); @@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) } EXPORT_SYMBOL(mem_cgroup_end_page_stat); -/** - * mem_cgroup_update_page_stat - update page state statistics - * @memcg: memcg to account against - * @idx: page state item to account - * @val: number of pages (positive or negative) - * - * See mem_cgroup_begin_page_stat() for locking requirements. - */ -void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx, int val) -{ - VM_BUG_ON(!rcu_read_lock_held()); - - if (memcg) - this_cpu_add(memcg->stat->count[idx], val); -} - /* * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. @@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) css_put_many(&memcg->css, nr_pages); } -/* - * helper for acessing a memcg's index. It will be used as an index in the - * child cache array in kmem_cache, and also to derive its name. This function - * will return -1 when this is not a kmem-limited memcg. - */ -int memcg_cache_id(struct mem_cgroup *memcg) -{ - return memcg ? memcg->kmemcg_id : -1; -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); - int ret = 0; struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *from; + struct task_struct *p; + struct mm_struct *mm; unsigned long move_flags; + int ret = 0; /* * We are now commited to this value whatever it is. Changes in this @@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * So we need to save it, and keep it going. */ move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (move_flags) { - struct mm_struct *mm; - struct mem_cgroup *from = mem_cgroup_from_task(p); + if (!move_flags) + return 0; - VM_BUG_ON(from == memcg); + p = cgroup_taskset_first(tset); + from = mem_cgroup_from_task(p); - mm = get_task_mm(p); - if (!mm) - return 0; - /* We move charges only when we move a owner of the mm */ - if (mm->owner == p) { - VM_BUG_ON(mc.from); - VM_BUG_ON(mc.to); - VM_BUG_ON(mc.precharge); - VM_BUG_ON(mc.moved_charge); - VM_BUG_ON(mc.moved_swap); - - spin_lock(&mc.lock); - mc.from = from; - mc.to = memcg; - mc.flags = move_flags; - spin_unlock(&mc.lock); - /* We set mc.moving_task later */ - - ret = mem_cgroup_precharge_mc(mm); - if (ret) - mem_cgroup_clear_mc(); - } - mmput(mm); + VM_BUG_ON(from == memcg); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + + spin_lock(&mc.lock); + mc.from = from; + mc.to = memcg; + mc.flags = move_flags; + spin_unlock(&mc.lock); + /* We set mc.moving_task later */ + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); } + mmput(mm); return ret; } @@ -5521,19 +5230,6 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_events - count memory events against a cgroup - * @memcg: the memory cgroup - * @idx: the event index - * @nr: the number of events to account for - */ -void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) -{ - this_cpu_add(memcg->stat->events[idx], nr); -} - -/** * mem_cgroup_low - check if memory consumption is below the normal range * @root: the highest ancestor to consider * @memcg: the memory cgroup to check diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1f4446a9..eeda648 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p) if (!mem) return -EINVAL; - css = mem_cgroup_css(mem); + css = &mem->css; ino = cgroup_ino(css->cgroup); css_put(css); @@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page) } EXPORT_SYMBOL_GPL(get_hwpoison_page); +/** + * put_hwpoison_page() - Put refcount for memory error handling: + * @page: raw error page (hit by memory error) + */ +void put_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) { + put_page(head); + return; + } + + if (PageTransHuge(head)) + if (page != head) + put_page(head); + + put_page(page); +} +EXPORT_SYMBOL_GPL(put_hwpoison_page); + /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) nr_pages = 1 << compound_order(hpage); else /* normal page or thp */ nr_pages = 1; - atomic_long_add(nr_pages, &num_poisoned_pages); + num_poisoned_pages_add(nr_pages); /* * We need/can do nothing about count=0 pages. @@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHWPoison(hpage)) { if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &num_poisoned_pages); + num_poisoned_pages_sub(nr_pages); unlock_page(hpage); return 0; } @@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) else pr_err("MCE: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); - put_page(p); - if (p != hpage) - put_page(hpage); + num_poisoned_pages_sub(nr_pages); + put_hwpoison_page(p); return -EBUSY; } VM_BUG_ON_PAGE(!page_count(p), p); @@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); - atomic_long_sub(nr_pages, &num_poisoned_pages); + num_poisoned_pages_sub(nr_pages); unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); + num_poisoned_pages_sub(nr_pages); unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); return 0; } @@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); return 0; } /* @@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn) return 0; } + if (page_count(page) > 1) { + pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn); + return 0; + } + + if (page_mapped(page)) { + pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn); + return 0; + } + + if (page_mapping(page)) { + pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + pfn); + return 0; + } + /* * unpoison_memory() can encounter thp only when the thp is being * worked by memory_failure() and the page lock is not held yet. @@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_dec(&num_poisoned_pages); + num_poisoned_pages_dec(); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn) */ if (TestClearPageHWPoison(page)) { pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_sub(nr_pages, &num_poisoned_pages); + num_poisoned_pages_sub(nr_pages); freeit = 1; if (PageHuge(page)) clear_page_hwpoison_huge_page(page); } unlock_page(page); - put_page(page); + put_hwpoison_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) - put_page(page); + put_hwpoison_page(page); return 0; } @@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) return alloc_huge_page_node(page_hstate(compound_head(p)), nid); else - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) /* * Try to free it. */ - put_page(page); + put_hwpoison_page(page); shake_page(page, 1); /* @@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) ret = __get_any_page(page, pfn, 0); if (!PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ - put_page(page); + put_hwpoison_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; @@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags) lock_page(hpage); if (PageHWPoison(hpage)) { unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } @@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags) * get_any_page() and isolate_huge_page() takes a refcount each, * so need to drop one here. */ - put_page(hpage); + put_hwpoison_page(hpage); if (!ret) { pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); return -EBUSY; @@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags) if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); + num_poisoned_pages_add(1 << compound_order(hpage)); } else { SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + num_poisoned_pages_inc(); } } return ret; @@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags) wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); - put_page(page); + put_hwpoison_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags) * would need to fix isolation locking first. */ if (ret == 1) { - put_page(page); + put_hwpoison_page(page); pr_info("soft_offline: %#lx: invalidated\n", pfn); SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + num_poisoned_pages_inc(); return 0; } @@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags) * Drop page reference which is came from get_any_page() * successful isolate_lru_page() already took another one. */ - put_page(page); + put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - if (!TestSetPageHWPoison(page)) - atomic_long_inc(&num_poisoned_pages); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { @@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags) pfn, ret, page->flags); if (ret > 0) ret = -EIO; - if (TestClearPageHWPoison(page)) - atomic_long_dec(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", @@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); + if (flags & MF_COUNT_INCREASED) + put_hwpoison_page(page); return -EBUSY; } if (!PageHuge(page) && PageTransHuge(hpage)) { if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { pr_info("soft offline: %#lx: failed to split THP\n", pfn); + if (flags & MF_COUNT_INCREASED) + put_hwpoison_page(page); return -EBUSY; } } @@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags) if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); if (!dequeue_hwpoisoned_huge_page(hpage)) - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); + num_poisoned_pages_add(1 << compound_order(hpage)); } else { if (!TestSetPageHWPoison(page)) - atomic_long_inc(&num_poisoned_pages); + num_poisoned_pages_inc(); } } return ret; diff --git a/mm/memory.c b/mm/memory.c index bb04d8f..6cd0b21 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping, if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - - /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); @@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else { /* * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. + * i_mmap_lock for write to protect against truncate. */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); } goto uncharge_out; } @@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else { /* * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. + * i_mmap_lock for write to protect against truncate. */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); } return ret; uncharge_out: @@ -3232,6 +3230,27 @@ out: return 0; } +static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd, + unsigned int flags) +{ + if (!vma->vm_ops) + return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm, barrier(); if (!pte_present(entry)) { if (pte_none(entry)) { - if (vma->vm_ops) + if (vma_is_anonymous(vma)) + return do_anonymous_page(mm, vma, address, + pte, pmd, flags); + else return do_fault(mm, vma, address, pte, pmd, flags, entry); - - return do_anonymous_page(mm, vma, address, pte, pmd, - flags); } return do_swap_page(mm, vma, address, pte, pmd, flags, entry); @@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - int ret = VM_FAULT_FALLBACK; - if (!vma->vm_ops) - ret = do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + int ret = create_huge_pmd(mm, vma, address, pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { @@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, orig_pmd, pmd); if (dirty && !pmd_write(orig_pmd)) { - ret = do_huge_pmd_wp_page(mm, vma, address, pmd, - orig_pmd); + ret = wp_huge_pmd(mm, vma, address, pmd, + orig_pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a7f1e0d..87a1779 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, qp->prev = vma; - if (vma->vm_flags & VM_PFNMAP) - return 1; - if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) @@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x return alloc_huge_page_node(page_hstate(compound_head(page)), node); else - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | + return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -2001,7 +1998,7 @@ retry_cpuset: nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); - page = alloc_pages_exact_node(hpage_node, + page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE, order); goto out; } diff --git a/mm/mempool.c b/mm/mempool.c index 2cc08de..4c533bc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool) */ void mempool_destroy(mempool_t *pool) { + if (unlikely(!pool)) + return; + while (pool->curr_nr) { void *element = remove_element(pool); pool->free(element, pool->pool_data); diff --git a/mm/memtest.c b/mm/memtest.c index 0a1cc13..8eaa4c3 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -1,11 +1,6 @@ #include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> #include <linux/types.h> -#include <linux/mm.h> -#include <linux/smp.h> #include <linux/init.h> -#include <linux/pfn.h> #include <linux/memblock.h> static u64 patterns[] __initdata = { @@ -31,10 +26,8 @@ static u64 patterns[] __initdata = { static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) { - printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", - (unsigned long long) pattern, - (unsigned long long) start_bad, - (unsigned long long) end_bad); + pr_info(" %016llx bad mem addr %pa - %pa reserved\n", + cpu_to_be64(pattern), &start_bad, &end_bad); memblock_reserve(start_bad, end_bad - start_bad); } @@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); if (this_start < this_end) { - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long)this_start, - (unsigned long long)this_end, - (unsigned long long)cpu_to_be64(pattern)); + pr_info(" %pa - %pa pattern %016llx\n", + &this_start, &this_end, cpu_to_be64(pattern)); memtest(pattern, this_start, this_end - this_start); } } } /* default is disabled */ -static int memtest_pattern __initdata; +static unsigned int memtest_pattern __initdata; static int __init parse_memtest(char *arg) { + int ret = 0; + if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0); + ret = kstrtouint(arg, 0, &memtest_pattern); else memtest_pattern = ARRAY_SIZE(patterns); - return 0; + return ret; } early_param("memtest", parse_memtest); @@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end) if (!memtest_pattern) return; - printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); + pr_info("early_memtest: # of tests: %u\n", memtest_pattern); for (i = memtest_pattern-1; i < UINT_MAX; --i) { idx = i % ARRAY_SIZE(patterns); do_one_pass(patterns[idx], start, end); diff --git a/mm/migrate.c b/mm/migrate.c index 5c08cab..02ce25d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes or remove ptes */ if (page_mapped(page)) { try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| - TTU_IGNORE_HWPOISON); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } @@ -952,9 +951,11 @@ out: dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE) + if (reason == MR_MEMORY_FAILURE) { put_page(page); - else + if (!test_set_page_hwpoison(page)) + num_poisoned_pages_inc(); + } else putback_lru_page(page); } @@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(p)), pm->node); else - return alloc_pages_exact_node(pm->node, + return __alloc_pages_node(pm->node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, int nid = (int) data; struct page *newpage; - newpage = alloc_pages_exact_node(nid, + newpage = __alloc_pages_node(nid, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & @@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; - int err = -ENOMEM; + int err; if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) @@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) - goto out_err; + return -ENOMEM; /* most fields are the same, copy all, and then fixup */ *new = *vma; @@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); - out_err: return err; } @@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + return -ENOMEM; + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, vma_pages(vma))) + return -ENOMEM; + /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index @@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * using the existing file pgoff checks and manipulations. * Similarly in do_mmap_pgoff and in do_brk. */ - if (!vma->vm_file) { + if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - return -ENOMEM; - if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, vma_pages(vma))) - return -ENOMEM; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; @@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ - if (unlikely(!vma->vm_file && !vma->anon_vma)) { + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } @@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (new_vma) { - *new_vma = *vma; - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; - if (vma_dup_policy(vma, new_vma)) - goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); - if (anon_vma_clone(new_vma, vma)) - goto out_free_mempol; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks = false; - } + if (!new_vma) + goto out; + *new_vma = *vma; + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (vma_dup_policy(vma, new_vma)) + goto out_free_vma; + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } return new_vma; - out_free_mempol: +out_free_mempol: mpol_put(vma_policy(new_vma)); - out_free_vma: +out_free_vma: kmem_cache_free(vm_area_cachep, new_vma); +out: return NULL; } @@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - /* - * special mappings have no vm_file, and in that case, the mm - * uses vm_pgoff internally. So we have to subtract it from here. - * We are allowed to do this because we are the mm; do not copy - * this code into drivers! - */ - pgoff = vmf->pgoff - vma->vm_pgoff; - if (vma->vm_ops == &legacy_special_mapping_vmops) pages = vma->vm_private_data; else pages = ((struct vm_special_mapping *)vma->vm_private_data)-> pages; - for (; pgoff && *pages; ++pages) + for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dff991e..1ecc0bc 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * Determine the type of allocation constraint. */ #ifdef CONFIG_NUMA -static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc, + unsigned long *totalpages) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); + enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; /* Default to all available memory */ *totalpages = totalram_pages + total_swap_pages; - if (!zonelist) + if (!oc->zonelist) return CONSTRAINT_NONE; /* * Reach here only when __GFP_NOFAIL is used. So, we should avoid * to kill current.We have to random task kill in this case. * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. */ - if (gfp_mask & __GFP_THISNODE) + if (oc->gfp_mask & __GFP_THISNODE) return CONSTRAINT_NONE; /* @@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * the page allocator means a mempolicy is in effect. Cpuset policy * is enforced in get_page_from_freelist(). */ - if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { + if (oc->nodemask && + !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { *totalpages = total_swap_pages; - for_each_node_mask(nid, *nodemask) + for_each_node_mask(nid, *oc->nodemask) *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; } /* Check this allocation failure is caused by cpuset's wall function */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) - if (!cpuset_zone_allowed(zone, gfp_mask)) + for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, + high_zoneidx, oc->nodemask) + if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; if (cpuset_limited) { @@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, return CONSTRAINT_NONE; } #else -static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc, + unsigned long *totalpages) { *totalpages = totalram_pages + total_swap_pages; return CONSTRAINT_NONE; } #endif -enum oom_scan_t oom_scan_process_thread(struct task_struct *task, - unsigned long totalpages, const nodemask_t *nodemask, - bool force_kill) +enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, + struct task_struct *task, unsigned long totalpages) { - if (oom_unkillable_task(task, NULL, nodemask)) + if (oom_unkillable_task(task, NULL, oc->nodemask)) return OOM_SCAN_CONTINUE; /* @@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, * Don't allow any other task to have access to the reserves. */ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!force_kill) + if (oc->order != -1) return OOM_SCAN_ABORT; } if (!task->mm) @@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !force_kill) + if (task_will_free_mem(task) && oc->order != -1) return OOM_SCAN_ABORT; return OOM_SCAN_OK; @@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, /* * Simple selection loop. We chose the process with the highest * number of 'points'. Returns -1 on scan abort. - * - * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, const nodemask_t *nodemask, - bool force_kill) +static struct task_struct *select_bad_process(struct oom_control *oc, + unsigned int *ppoints, unsigned long totalpages) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, for_each_process_thread(g, p) { unsigned int points; - switch (oom_scan_process_thread(p, totalpages, nodemask, - force_kill)) { + switch (oom_scan_process_thread(oc, p, totalpages)) { case OOM_SCAN_SELECT: chosen = p; chosen_points = ULONG_MAX; @@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, case OOM_SCAN_OK: break; }; - points = oom_badness(p, NULL, nodemask, totalpages); + points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; /* Prefer thread group leaders for display purposes */ @@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } -static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_header(struct oom_control *oc, struct task_struct *p, + struct mem_cgroup *memcg) { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " "oom_score_adj=%hd\n", - current->comm, gfp_mask, order, + current->comm, oc->gfp_mask, oc->order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); @@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, else show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(memcg, nodemask); + dump_tasks(memcg, oc->nodemask); } /* @@ -487,10 +481,9 @@ void oom_killer_enable(void) * Must be called while holding a reference to p, which will be released upon * returning. */ -void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, +void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, nodemask_t *nodemask, - const char *message) + struct mem_cgroup *memcg, const char *message) { struct task_struct *victim = p; struct task_struct *child; @@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(p); if (__ratelimit(&oom_rs)) - dump_header(p, gfp_mask, order, memcg, nodemask); + dump_header(oc, p, memcg); task_lock(p); pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", @@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, memcg, nodemask, + child_points = oom_badness(child, memcg, oc->nodemask, totalpages); if (child_points > victim_points) { put_task_struct(victim); @@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask, +void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, struct mem_cgroup *memcg) { if (likely(!sysctl_panic_on_oom)) @@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - dump_header(NULL, gfp_mask, order, memcg, nodemask); + /* Do not panic for oom kills triggered by sysrq */ + if (oc->order == -1) + return; + dump_header(oc, NULL, memcg); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_oom_notifier); /** - * __out_of_memory - kill the "best" process when we run out of memory - * @zonelist: zonelist pointer - * @gfp_mask: memory allocation flags - * @order: amount of memory being requested as a power of 2 - * @nodemask: nodemask passed to page allocator - * @force_kill: true if a task must be killed, even if others are exiting + * out_of_memory - kill the "best" process when we run out of memory + * @oc: pointer to struct oom_control * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) +bool out_of_memory(struct oom_control *oc) { - const nodemask_t *mpol_mask; struct task_struct *p; unsigned long totalpages; unsigned long freed = 0; unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; - int killed = 0; if (oom_killer_disabled) return false; @@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ - goto out; + return true; /* * If current has a pending SIGKILL or is exiting, then automatically @@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); - goto out; + return true; } /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask, nodemask, - &totalpages); - mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; - check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); + constraint = constrained_alloc(oc, &totalpages); + if (constraint != CONSTRAINT_MEMORY_POLICY) + oc->nodemask = NULL; + check_panic_on_oom(oc, constraint, NULL); if (sysctl_oom_kill_allocating_task && current->mm && - !oom_unkillable_task(current, NULL, nodemask) && + !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, - nodemask, + oom_kill_process(oc, current, 0, totalpages, NULL, "Out of memory (oom_kill_allocating_task)"); - goto out; + return true; } - p = select_bad_process(&points, totalpages, mpol_mask, force_kill); + p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + if (!p && oc->order != -1) { + dump_header(oc, NULL, NULL); panic("Out of memory and no killable processes...\n"); } - if (p != (void *)-1UL) { - oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, - nodemask, "Out of memory"); - killed = 1; - } -out: - /* - * Give the killed threads a good chance of exiting before trying to - * allocate memory again. - */ - if (killed) + if (p && p != (void *)-1UL) { + oom_kill_process(oc, p, points, totalpages, NULL, + "Out of memory"); + /* + * Give the killed process a good chance to exit before trying + * to allocate memory again. + */ schedule_timeout_killable(1); - + } return true; } @@ -728,13 +711,20 @@ out: */ void pagefault_out_of_memory(void) { + struct oom_control oc = { + .zonelist = NULL, + .nodemask = NULL, + .gfp_mask = 0, + .order = 0, + }; + if (mem_cgroup_oom_synchronize(true)) return; if (!mutex_trylock(&oom_lock)) return; - if (!out_of_memory(NULL, 0, 0, NULL, false)) { + if (!out_of_memory(&oc)) { /* * There shouldn't be any user tasks runnable while the * OOM killer is disabled, so the current task has to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b401d40..48aaf7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +/* + * A cached value of the page's pageblock's migratetype, used when the page is + * put on a pcplist. Used to avoid the pageblock migratetype lookup when + * freeing from pcplists in most cases, at the cost of possibly becoming stale. + * Also the migratetype set in the page does not necessarily match the pcplist + * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any + * other index - this ensures that it will be put on the correct CMA freelist. + */ +static inline int get_pcppage_migratetype(struct page *page) +{ + return page->index; +} + +static inline void set_pcppage_migratetype(struct page *page, int migratetype) +{ + page->index = migratetype; +} + #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily @@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); + + mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); @@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); return page; } @@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); - set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) expand(zone, page, order, current_order, area, start_migratetype); /* - * The freepage_migratetype may differ from pageblock's + * The pcppage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. + * find_suitable_fallback(). This is OK as long as it does not + * differ for MIGRATE_CMA pageblocks. Those can be used as + * fallback only via special __rmqueue_cma_fallback() function */ - set_freepage_migratetype(page, start_migratetype); + set_pcppage_migratetype(page, start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; - if (is_migrate_cma(get_freepage_migratetype(page))) + if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold) return; migratetype = get_pfnblock_migratetype(page, pfn); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_freepage_migratetype(page)); + get_pcppage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); @@ -2696,6 +2715,12 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) { + struct oom_control oc = { + .zonelist = ac->zonelist, + .nodemask = ac->nodemask, + .gfp_mask = gfp_mask, + .order = order, + }; struct page *page; *did_some_progress = 0; @@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) - || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: mutex_unlock(&oom_lock); @@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. - * Note this is not alloc_pages_exact_node() which allocates on a specific node, - * but is not exact. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { @@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, { unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node, the node should be empty */ + /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; @@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node, the node should be empty */ + /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; @@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * * NOTE: pgdat should get zeroed by caller. */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn) +static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; @@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn); + free_area_init_core(pgdat); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, */ void __init setup_nr_node_ids(void) { - unsigned int node; - unsigned int highest = 0; + unsigned int highest; - for_each_node_mask(node, node_possible_map) - highest = node; + highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); nr_node_ids = highest + 1; } #endif @@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str) * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved * - * The per-cpu batchsize and zone watermarks are determined by present_pages. + * The per-cpu batchsize and zone watermarks are determined by managed_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. This * function may optionally be used to account for unfreeable pages in the @@ -6059,7 +6079,7 @@ void __init page_alloc_init(void) } /* - * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio * or min_free_kbytes changes. */ static void calculate_totalreserve_pages(void) @@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void) /* * setup_per_zone_lowmem_reserve - called whenever - * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * sysctl_lowmem_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of * pages are left in the zone after a successful __alloc_pages(). */ diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 303c908..4568fd5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -9,7 +9,8 @@ #include <linux/hugetlb.h> #include "internal.h" -int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) +static int set_migratetype_isolate(struct page *page, + bool skip_hwpoisoned_pages) { struct zone *zone; unsigned long flags, pfn; @@ -72,7 +73,7 @@ out: return ret; } -void unset_migratetype_isolate(struct page *page, unsigned migratetype) +static void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; @@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) { + if (PageBuddy(page)) /* - * If race between isolatation and allocation happens, - * some free pages could be in MIGRATE_MOVABLE list - * although pageblock's migratation type of the page - * is MIGRATE_ISOLATE. Catch it and move the page into - * MIGRATE_ISOLATE list. + * If the page is on a free list, it has to be on + * the correct MIGRATE_ISOLATE freelist. There is no + * simple way to verify that as VM_BUG_ON(), though. */ - if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { - struct page *end_page; - - end_page = page + (1 << page_order(page)) - 1; - move_freepages(page_zone(page), page, end_page, - MIGRATE_ISOLATE); - } pfn += 1 << page_order(page); - } - else if (page_count(page) == 0 && - get_freepage_migratetype(page) == MIGRATE_ISOLATE) - pfn += 1; - else if (skip_hwpoisoned_pages && PageHWPoison(page)) { - /* - * The HWPoisoned page may be not in buddy - * system, and page_count() is not 0. - */ + else if (skip_hwpoisoned_pages && PageHWPoison(page)) + /* A HWPoisoned page cannot be also PageBuddy */ pfn++; - continue; - } else break; } @@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); +static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + generic_fillattr(inode, stat); + + return 0; +} + static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = { }; static const struct inode_operations shmem_inode_operations = { + .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, @@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (memcg_charge_slab(cachep, flags, cachep->gfporder)) return NULL; - page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); diff --git a/mm/slab_common.c b/mm/slab_common.c index c26829f..5ce4fae 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ - struct cgroup_subsys_state *css = mem_cgroup_css(memcg); + struct cgroup_subsys_state *css = &memcg->css; struct memcg_cache_array *arr; struct kmem_cache *s = NULL; char *cache_name; @@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s) bool need_rcu_barrier = false; bool busy = false; + if (unlikely(!s)) + return; + BUG_ON(!is_root_cache(s)); get_online_cpus(); @@ -45,7 +45,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_exact_node() with the specified node id is used + * provided, __alloc_pages_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != NUMA_NO_NODE) - page = alloc_pages_exact_node(node, gfp, order); + page = __alloc_pages_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); @@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else - page = alloc_pages_exact_node(node, flags, order); + page = __alloc_pages_node(node, flags, order); if (!page) memcg_uncharge_slab(s, order); diff --git a/mm/swap_state.c b/mm/swap_state.c index 8bc8e66..d504adb 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry) return page; } -/* - * Locate a page of swap in physical memory, reserving swap cache space - * and reading the disk if it is not already cached. - * A failure return means that either the page allocation failed or that - * the swap entry is no longer in use. - */ -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) +struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + bool *new_page_allocated) { struct page *found_page, *new_page = NULL; + struct address_space *swapper_space = swap_address_space(entry); int err; + *new_page_allocated = false; do { /* @@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swap_address_space(entry), - entry.val); + found_page = find_get_page(swapper_space, entry.val); if (found_page) break; @@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(new_page); + *new_page_allocated = true; return new_page; } radix_tree_preload_end(); @@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. + * A failure return means that either the page allocation failed or that + * the swap entry is no longer in use. + */ +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + bool page_was_allocated; + struct page *retpage = __read_swap_cache_async(entry, gfp_mask, + vma, addr, &page_was_allocated); + + if (page_was_allocated) + swap_readpage(retpage); + + return retpage; +} + static unsigned long swapin_nr_pages(unsigned long offset) { static unsigned long prev_offset; diff --git a/mm/swapfile.c b/mm/swapfile.c index aebc2dd6e..5887731 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -875,6 +875,48 @@ int page_swapcount(struct page *page) } /* + * How many references to @entry are currently swapped out? + * This considers COUNT_CONTINUED so it returns exact answer. + */ +int swp_swapcount(swp_entry_t entry) +{ + int count, tmp_count, n; + struct swap_info_struct *p; + struct page *page; + pgoff_t offset; + unsigned char *map; + + p = swap_info_get(entry); + if (!p) + return 0; + + count = swap_count(p->swap_map[swp_offset(entry)]); + if (!(count & COUNT_CONTINUED)) + goto out; + + count &= ~COUNT_CONTINUED; + n = SWAP_MAP_MAX + 1; + + offset = swp_offset(entry); + page = vmalloc_to_page(p->swap_map + offset); + offset &= ~PAGE_MASK; + VM_BUG_ON(page_private(page) != SWP_CONTINUED); + + do { + page = list_entry(page->lru.next, struct page, lru); + map = kmap_atomic(page); + tmp_count = map[offset]; + kunmap_atomic(map); + + count += (tmp_count & ~COUNT_CONTINUED) * n; + n *= (SWAP_CONT_MAX + 1); + } while (tmp_count & COUNT_CONTINUED); +out: + spin_unlock(&p->lock); + return count; +} + +/* * We can write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content diff --git a/mm/vmscan.c b/mm/vmscan.c index b113903..2d978b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc) if (!memcg) return true; #ifdef CONFIG_CGROUP_WRITEBACK - if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup)) + if (memcg->css.cgroup) return true; #endif return false; @@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * 3) Legacy memcg encounters a page that is not already marked + * 3) Legacy memcg encounters a page that is already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to @@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; /* Case 3 above */ } else { + unlock_page(page); wait_on_page_writeback(page); + /* then go back and try same page again */ + list_add_tail(&page->lru, page_list); + continue; } } @@ -1196,7 +1199,7 @@ cull_mlocked: if (PageSwapCache(page)) try_to_free_swap(page); unlock_page(page); - putback_lru_page(page); + list_add(&page->lru, &ret_pages); continue; activate_locked: @@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long scan; - for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && + !list_empty(src); scan++) { struct page *page; int nr_pages; @@ -96,10 +96,10 @@ struct zbud_pool { struct list_head buddied; struct list_head lru; u64 pages_nr; - struct zbud_ops *ops; + const struct zbud_ops *ops; #ifdef CONFIG_ZPOOL struct zpool *zpool; - struct zpool_ops *zpool_ops; + const struct zpool_ops *zpool_ops; #endif }; @@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) return -ENOENT; } -static struct zbud_ops zbud_zpool_ops = { +static const struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; static void *zbud_zpool_create(char *name, gfp_t gfp, - struct zpool_ops *zpool_ops, + const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct zbud_pool *pool; @@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr) * Return: pointer to the new zbud pool or NULL if the metadata allocation * failed. */ -struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) { struct zbud_pool *pool; int i; @@ -22,7 +22,7 @@ struct zpool { struct zpool_driver *driver; void *pool; - struct zpool_ops *ops; + const struct zpool_ops *ops; struct list_head list; }; @@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver) * Returns: New zpool on success, NULL on failure. */ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, - struct zpool_ops *ops) + const struct zpool_ops *ops) { struct zpool_driver *driver; struct zpool *zpool; @@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } -static int __init init_zpool(void) -{ - pr_info("loaded\n"); - return 0; -} - -static void __exit exit_zpool(void) -{ - pr_info("unloaded\n"); -} - -module_init(init_zpool); -module_exit(exit_zpool); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0a7f81a..f135b1b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -169,14 +169,12 @@ enum zs_stat_type { NR_ZS_STAT_TYPE, }; -#ifdef CONFIG_ZSMALLOC_STAT - -static struct dentry *zs_stat_root; - struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; +#ifdef CONFIG_ZSMALLOC_STAT +static struct dentry *zs_stat_root; #endif /* @@ -201,6 +199,8 @@ static int zs_size_classes; static const int fullness_threshold_frac = 4; struct size_class { + spinlock_t lock; + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. @@ -210,16 +210,10 @@ struct size_class { /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ - bool huge; - -#ifdef CONFIG_ZSMALLOC_STAT struct zs_size_stat stats; -#endif - - spinlock_t lock; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; }; /* @@ -251,6 +245,15 @@ struct zs_pool { gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; + struct zs_pool_stats stats; + + /* Compact classes */ + struct shrinker shrinker; + /* + * To signify that register_shrinker() was successful + * and unregister_shrinker() will not Oops. + */ + bool shrinker_enabled; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool) static void destroy_handle_cache(struct zs_pool *pool) { - if (pool->handle_cachep) - kmem_cache_destroy(pool->handle_cachep); + kmem_cache_destroy(pool->handle_cachep); } static unsigned long alloc_handle(struct zs_pool *pool) @@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops, +static void *zs_zpool_create(char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, struct zpool *zpool) { return zs_create_pool(name, gfp); @@ -441,8 +444,6 @@ static int get_size_class_index(int size) return min(zs_size_classes - 1, idx); } -#ifdef CONFIG_ZSMALLOC_STAT - static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { @@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class, return class->stats.objs[type]; } +#ifdef CONFIG_ZSMALLOC_STAT + static int __init zs_stat_init(void) { if (!debugfs_initialized()) @@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool) } #else /* CONFIG_ZSMALLOC_STAT */ - -static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) -{ - return 0; -} - static int __init zs_stat_init(void) { return 0; @@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) static inline void zs_pool_stat_destroy(struct zs_pool *pool) { } - #endif @@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class, if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; - head = &class->fullness_list[fullness]; - if (*head) - list_add_tail(&page->lru, &(*head)->lru); - - *head = page; zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + + head = &class->fullness_list[fullness]; + if (!*head) { + *head = page; + return; + } + + /* + * We want to see more ZS_FULL pages and less almost + * empty/full. Put pages with higher ->inuse first. + */ + list_add_tail(&page->lru, &(*head)->lru); + if (page->inuse >= (*head)->inuse) + *head = page; } /* @@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_free); -static void zs_object_copy(unsigned long src, unsigned long dst, +static void zs_object_copy(unsigned long dst, unsigned long src, struct size_class *class) { struct page *s_page, *d_page; @@ -1602,8 +1596,6 @@ struct zs_compact_control { /* Starting object index within @s_page which used for live object * in the subpage. */ int index; - /* how many of objects are migrated */ - int nr_migrated; }; static int migrate_zspage(struct zs_pool *pool, struct size_class *class, @@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, struct page *s_page = cc->s_page; struct page *d_page = cc->d_page; unsigned long index = cc->index; - int nr_migrated = 0; int ret = 0; while (1) { @@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, used_obj = handle_to_obj(handle); free_obj = obj_malloc(d_page, class, handle); - zs_object_copy(used_obj, free_obj, class); + zs_object_copy(free_obj, used_obj, class); index++; record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); - nr_migrated++; } /* Remember last position in this iteration */ cc->s_page = s_page; cc->index = index; - cc->nr_migrated = nr_migrated; return ret; } -static struct page *alloc_target_page(struct size_class *class) +static struct page *isolate_target_page(struct size_class *class) { int i; struct page *page; @@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class) return page; } -static void putback_zspage(struct zs_pool *pool, struct size_class *class, - struct page *first_page) +/* + * putback_zspage - add @first_page into right class's fullness list + * @pool: target pool + * @class: destination class + * @first_page: target page + * + * Return @fist_page's fullness_group + */ +static enum fullness_group putback_zspage(struct zs_pool *pool, + struct size_class *class, + struct page *first_page) { enum fullness_group fullness; @@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class, free_zspage(first_page); } + + return fullness; } static struct page *isolate_source_page(struct size_class *class) { - struct page *page; + int i; + struct page *page = NULL; - page = class->fullness_list[ZS_ALMOST_EMPTY]; - if (page) - remove_zspage(page, class, ZS_ALMOST_EMPTY); + for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { + page = class->fullness_list[i]; + if (!page) + continue; + + remove_zspage(page, class, i); + break; + } return page; } -static unsigned long __zs_compact(struct zs_pool *pool, - struct size_class *class) +/* + * + * Based on the number of unused allocated objects calculate + * and return the number of pages that we can free. + */ +static unsigned long zs_can_compact(struct size_class *class) +{ + unsigned long obj_wasted; + + obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - + zs_stat_get(class, OBJ_USED); + + obj_wasted /= get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + + return obj_wasted * class->pages_per_zspage; +} + +static void __zs_compact(struct zs_pool *pool, struct size_class *class) { - int nr_to_migrate; struct zs_compact_control cc; struct page *src_page; struct page *dst_page = NULL; - unsigned long nr_total_migrated = 0; spin_lock(&class->lock); while ((src_page = isolate_source_page(class))) { BUG_ON(!is_first_page(src_page)); - /* The goal is to migrate all live objects in source page */ - nr_to_migrate = src_page->inuse; + if (!zs_can_compact(class)) + break; + cc.index = 0; cc.s_page = src_page; - while ((dst_page = alloc_target_page(class))) { + while ((dst_page = isolate_target_page(class))) { cc.d_page = dst_page; /* - * If there is no more space in dst_page, try to - * allocate another zspage. + * If there is no more space in dst_page, resched + * and see if anyone had allocated another zspage. */ if (!migrate_zspage(pool, class, &cc)) break; putback_zspage(pool, class, dst_page); - nr_total_migrated += cc.nr_migrated; - nr_to_migrate -= cc.nr_migrated; } /* Stop if we couldn't find slot */ @@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(pool, class, dst_page); - putback_zspage(pool, class, src_page); + if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + pool->stats.pages_compacted += class->pages_per_zspage; spin_unlock(&class->lock); - nr_total_migrated += cc.nr_migrated; cond_resched(); spin_lock(&class->lock); } @@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, putback_zspage(pool, class, src_page); spin_unlock(&class->lock); - - return nr_total_migrated; } unsigned long zs_compact(struct zs_pool *pool) { int i; - unsigned long nr_migrated = 0; struct size_class *class; for (i = zs_size_classes - 1; i >= 0; i--) { @@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool) continue; if (class->index != i) continue; - nr_migrated += __zs_compact(pool, class); + __zs_compact(pool, class); } - return nr_migrated; + return pool->stats.pages_compacted; } EXPORT_SYMBOL_GPL(zs_compact); +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) +{ + memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); +} +EXPORT_SYMBOL_GPL(zs_pool_stats); + +static unsigned long zs_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_freed; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + pages_freed = pool->stats.pages_compacted; + /* + * Compact classes and calculate compaction delta. + * Can run concurrently with a manually triggered + * (by user) compaction. + */ + pages_freed = zs_compact(pool) - pages_freed; + + return pages_freed ? pages_freed : SHRINK_STOP; +} + +static unsigned long zs_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int i; + struct size_class *class; + unsigned long pages_to_free = 0; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + if (!pool->shrinker_enabled) + return 0; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + + pages_to_free += zs_can_compact(class); + } + + return pages_to_free; +} + +static void zs_unregister_shrinker(struct zs_pool *pool) +{ + if (pool->shrinker_enabled) { + unregister_shrinker(&pool->shrinker); + pool->shrinker_enabled = false; + } +} + +static int zs_register_shrinker(struct zs_pool *pool) +{ + pool->shrinker.scan_objects = zs_shrinker_scan; + pool->shrinker.count_objects = zs_shrinker_count; + pool->shrinker.batch = 0; + pool->shrinker.seeks = DEFAULT_SEEKS; + + return register_shrinker(&pool->shrinker); +} + /** * zs_create_pool - Creates an allocation pool to work from. * @flags: allocation flags used to allocate pool metadata @@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) if (zs_pool_stat_create(name, pool)) goto err; + /* + * Not critical, we still can use the pool + * and user can trigger compaction manually. + */ + if (zs_register_shrinker(pool) == 0) + pool->shrinker_enabled = true; return pool; err: @@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool) { int i; + zs_unregister_shrinker(pool); zs_pool_stat_destroy(pool); for (i = 0; i < zs_size_classes; i++) { @@ -446,75 +446,14 @@ enum zswap_get_swap_ret { static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); - int err; + bool page_was_allocated; - *retpage = NULL; - do { - /* - * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling - * that would confuse statistics. - */ - found_page = find_get_page(swapper_space, entry.val); - if (found_page) - break; - - /* - * Get a new page to read into from swap. - */ - if (!new_page) { - new_page = alloc_page(GFP_KERNEL); - if (!new_page) - break; /* Out of memory */ - } - - /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_preload(GFP_KERNEL); - if (err) - break; - - /* - * Swap entry may have been freed since our caller observed it. - */ - err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ - radix_tree_preload_end(); - continue; - } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); - break; - } - - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ - __set_page_locked(new_page); - SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); - if (likely(!err)) { - radix_tree_preload_end(); - lru_cache_add_anon(new_page); - *retpage = new_page; - return ZSWAP_SWAPCACHE_NEW; - } - radix_tree_preload_end(); - ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ - swapcache_free(entry); - } while (err != -ENOMEM); - - if (new_page) - page_cache_release(new_page); - if (!found_page) + *retpage = __read_swap_cache_async(entry, GFP_KERNEL, + NULL, 0, &page_was_allocated); + if (page_was_allocated) + return ZSWAP_SWAPCACHE_NEW; + if (!*retpage) return ZSWAP_SWAPCACHE_FAIL; - *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; } @@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static struct zpool_ops zswap_zpool_ops = { +static const struct zpool_ops zswap_zpool_ops = { .evict = zswap_writeback_entry }; |