diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 12 | ||||
-rw-r--r-- | mm/compaction.c | 562 | ||||
-rw-r--r-- | mm/fadvise.c | 34 | ||||
-rw-r--r-- | mm/filemap.c | 13 | ||||
-rw-r--r-- | mm/filemap_xip.c | 10 | ||||
-rw-r--r-- | mm/fremap.c | 19 | ||||
-rw-r--r-- | mm/frontswap.c | 34 | ||||
-rw-r--r-- | mm/huge_memory.c | 441 | ||||
-rw-r--r-- | mm/hugetlb.c | 34 | ||||
-rw-r--r-- | mm/internal.h | 52 | ||||
-rw-r--r-- | mm/interval_tree.c | 112 | ||||
-rw-r--r-- | mm/kmemleak.c | 106 | ||||
-rw-r--r-- | mm/ksm.c | 40 | ||||
-rw-r--r-- | mm/madvise.c | 8 | ||||
-rw-r--r-- | mm/memblock.c | 7 | ||||
-rw-r--r-- | mm/memcontrol.c | 29 | ||||
-rw-r--r-- | mm/memory-failure.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 115 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 93 | ||||
-rw-r--r-- | mm/mempolicy.c | 150 | ||||
-rw-r--r-- | mm/mlock.c | 27 | ||||
-rw-r--r-- | mm/mmap.c | 215 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 103 | ||||
-rw-r--r-- | mm/mremap.c | 73 | ||||
-rw-r--r-- | mm/nobootmem.c | 5 | ||||
-rw-r--r-- | mm/nommu.c | 39 | ||||
-rw-r--r-- | mm/oom_kill.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 319 | ||||
-rw-r--r-- | mm/page_isolation.c | 43 | ||||
-rw-r--r-- | mm/percpu.c | 2 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 50 | ||||
-rw-r--r-- | mm/prio_tree.c | 208 | ||||
-rw-r--r-- | mm/readahead.c | 14 | ||||
-rw-r--r-- | mm/rmap.c | 159 | ||||
-rw-r--r-- | mm/shmem.c | 174 | ||||
-rw-r--r-- | mm/slab.c | 357 | ||||
-rw-r--r-- | mm/slab.h | 19 | ||||
-rw-r--r-- | mm/slab_common.c | 159 | ||||
-rw-r--r-- | mm/slob.c | 91 | ||||
-rw-r--r-- | mm/slub.c | 223 | ||||
-rw-r--r-- | mm/swap.c | 13 | ||||
-rw-r--r-- | mm/truncate.c | 3 | ||||
-rw-r--r-- | mm/util.c | 35 | ||||
-rw-r--r-- | mm/vmalloc.c | 5 | ||||
-rw-r--r-- | mm/vmscan.c | 112 | ||||
-rw-r--r-- | mm/vmstat.c | 16 |
48 files changed, 2389 insertions, 1965 deletions
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS # support for memory compaction config COMPACTION bool "Allow for memory compaction" + def_bool y select MIGRATION depends on MMU help @@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on X86 && MMU + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and diff --git a/mm/Makefile b/mm/Makefile index 92753e2..6b025f8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/bootmem.c b/mm/bootmem.c index bcb63ac..434be4a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); + fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), + start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); + fixup_zone_present_pages( + page_to_nid(page), + start + off, start + off + 1); count++; } vec >>= 1; @@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) + while (pages--) { + fixup_zone_present_pages(page_to_nid(page), + page_to_pfn(page), page_to_pfn(page) + 1); __free_pages_bootmem(page++, 0); + } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); @@ -419,7 +427,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, } /** - * reserve_bootmem - mark a page range as usable + * reserve_bootmem - mark a page range as reserved * @addr: starting address of the range * @size: size of the range in bytes * @flags: reservation flags (see linux/bootmem.h) diff --git a/mm/compaction.c b/mm/compaction.c index 7fcd3a5..2c4ce17 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +#ifdef CONFIG_COMPACTION +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void __reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long pfn; + + zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_free_pfn = end_pfn; + zone->compact_blockskip_flush = false; + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } +} + +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by __reset_isolation_suitable(). + */ +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) +{ + struct zone *zone = cc->zone; + if (!page) + return; + + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); + set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc->finished_update_migrate && + pfn > zone->compact_cached_migrate_pfn) + zone->compact_cached_migrate_pfn = pfn; + } else { + if (!cc->finished_update_free && + pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; + } + } +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) +{ +} +#endif /* CONFIG_COMPACTION */ + +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. Check if the process needs to be scheduled or @@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (need_resched() || spin_is_contended(lock)) { + if (should_release_lock(lock)) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc->sync) { - if (cc->contended) - *cc->contended = true; + cc->contended = true; return false; } cond_resched(); - if (fatal_signal_pending(current)) - return false; } if (!locked) @@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + +static void compact_capture_page(struct compact_control *cc) +{ + unsigned long flags; + int mtype, mtype_low, mtype_high; + + if (!cc->page || *cc->page) + return; + + /* + * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP + * regardless of the migratetype of the freelist is is captured from. + * This is fine because the order for a high-order MIGRATE_MOVABLE + * allocation is typically at least a pageblock size and overall + * fragmentation is not impaired. Other allocation types must + * capture pages from their own migratelist because otherwise they + * could pollute other pageblocks like MIGRATE_MOVABLE with + * difficult to move pages and making fragmentation worse overall. + */ + if (cc->migratetype == MIGRATE_MOVABLE) { + mtype_low = 0; + mtype_high = MIGRATE_PCPTYPES; + } else { + mtype_low = cc->migratetype; + mtype_high = cc->migratetype + 1; + } + + /* Speculatively examine the free lists without zone lock */ + for (mtype = mtype_low; mtype < mtype_high; mtype++) { + int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct page *page; + struct free_area *area; + area = &(cc->zone->free_area[order]); + if (list_empty(&area->free_list[mtype])) + continue; + + /* Take the lock and attempt capture of the page */ + if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) + return; + if (!list_empty(&area->free_list[mtype])) { + page = list_entry(area->free_list[mtype].next, + struct page, lru); + if (capture_free_page(page, cc->order, mtype)) { + spin_unlock_irqrestore(&cc->zone->lock, + flags); + *cc->page = page; + return; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + } +} + /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free * pages inside of the pageblock (even though it may still end up isolating * some pages). */ -static unsigned long isolate_freepages_block(unsigned long blockpfn, +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *cursor, *valid_page = NULL; + unsigned long nr_strict_required = end_pfn - blockpfn; + unsigned long flags; + bool locked = false; cursor = pfn_to_page(blockpfn); - /* Isolate free pages. This assumes the block is valid */ + /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn++, cursor++) { int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } nr_scanned++; + if (!pfn_valid_within(blockpfn)) + continue; + if (!valid_page) + valid_page = page; + if (!PageBuddy(page)) + continue; - if (!PageBuddy(page)) { - if (strict) - return 0; + /* + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(&cc->zone->lock, &flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict && !suitable_migration_target(page)) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) continue; - } /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated && strict) - return 0; + break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); + + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. + */ + if (strict && nr_strict_required != total_isolated) + total_isolated = 0; + + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(cc, valid_page, total_isolated, false); + return total_isolated; } @@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, * a free page). */ unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn, flags; - struct zone *zone = NULL; + unsigned long isolated, pfn, block_end_pfn; LIST_HEAD(freelist); - if (pfn_valid(start_pfn)) - zone = page_zone(pfn_to_page(start_pfn)); - for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { - if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) + if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) break; /* @@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - spin_lock_irqsave(&zone->lock, flags); - isolated = isolate_freepages_block(pfn, block_end_pfn, + isolated = isolate_freepages_block(cc, pfn, block_end_pfn, &freelist, true); - spin_unlock_irqrestore(&zone->lock, flags); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) * @cc: Compaction control structure. * @low_pfn: The first PFN of the range. * @end_pfn: The one-past-the-last PFN of the range. + * @unevictable: true if it allows to isolate unevictable pages * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). Returns zero if there is a fatal signal @@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone) */ unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn) + unsigned long low_pfn, unsigned long end_pfn, bool unevictable) { unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; @@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; - bool locked; + bool locked = false; + struct page *page = NULL, *valid_page = NULL; /* * Ensure that there are not too many pages isolated from the LRU @@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irqsave(&zone->lru_lock, flags); - locked = true; for (; low_pfn < end_pfn; low_pfn++) { - struct page *page; - /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - locked = false; + if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (should_release_lock(&zone->lru_lock)) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } } - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(&zone->lru_lock, &flags, - locked, cc); - if (!locked) - break; - /* * migrate_pfn does not necessarily start aligned to a * pageblock. Ensure that pfn_valid is called when moving @@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (page_zone(page) != zone) continue; + if (!valid_page) + valid_page = page; + + /* If isolation recently failed, do not retry */ + pageblock_nr = low_pfn >> pageblock_order; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + /* Skip if free */ if (PageBuddy(page)) continue; @@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * migration is optimistic to see if the minimum amount of work * satisfies the allocation */ - pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; - continue; + cc->finished_update_migrate = true; + goto next_pageblock; } + /* Check may be lockless but that's ok as we recheck later */ if (!PageLRU(page)) continue; /* - * PageLRU is set, and lru_lock excludes isolation, - * splitting and collapsing (collapsing has already - * happened if PageLRU is set). + * PageLRU is set. lru_lock normally excludes isolation + * splitting and collapsing (collapsing has already happened + * if PageLRU is set) but the lock is not necessarily taken + * here and it is wasteful to take it just to check transhuge. + * Check TransHuge without lock and skip the whole pageblock if + * it's either a transhuge or hugetlbfs page, as calling + * compound_order() without preventing THP from splitting the + * page underneath us may return surprising results. */ if (PageTransHuge(page)) { + if (!locked) + goto next_pageblock; + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked || fatal_signal_pending(current)) + break; + + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; + if (PageTransHuge(page)) { low_pfn += (1 << compound_order(page)) - 1; continue; } @@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; + if (unevictable) + mode |= ISOLATE_UNEVICTABLE; + lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ + cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; @@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, ++low_pfn; break; } + + continue; + +next_pageblock: + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); @@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, true); + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); return low_pfn; @@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION - -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; - - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) - return true; - - /* Otherwise skip the block */ - return false; -} - -/* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. - */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone->zone_start_pfn + zone->spanned_pages; - free_pfn &= ~(pageblock_nr_pages-1); - return free_pfn; -} - /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. @@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone, { struct page *page; unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; - unsigned long flags; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled - */ - isolated = 0; + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; - /* - * The zone lock must be held to isolate freepages. This - * unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock - */ - if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) - break; - if (suitable_migration_target(page)) { - end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); - isolated = isolate_freepages_block(pfn, end_pfn, - freelist, false); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); + /* Found a block suitable for isolating free pages from */ + isolated = 0; + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + isolated = isolate_freepages_block(cc, pfn, end_pfn, + freelist, false); + nr_freepages += isolated; /* * Record the highest PFN we isolated pages from. When next @@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone, * page migration may have returned some pages to the allocator */ if (isolated) { + cc->finished_update_free = true; high_pfn = max(high_pfn, pfn); - - /* - * If the free scanner has wrapped, update - * compact_cached_free_pfn to point to the highest - * pageblock with free pages. This reduces excessive - * scanning of full pageblocks near the end of the - * zone - */ - if (cc->order > 0 && cc->wrapped) - zone->compact_cached_free_pfn = high_pfn; } } @@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone, cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; - - /* If compact_cached_free_pfn is reset then set it now */ - if (cc->order > 0 && !cc->wrapped && - zone->compact_cached_free_pfn == start_free_pfn(zone)) - zone->compact_cached_free_pfn = high_pfn; } /* @@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* Perform the isolation */ - low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn) + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); + if (!low_pfn || cc->contended) return ISOLATE_ABORT; cc->migrate_pfn = low_pfn; @@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { - unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* - * A full (order == -1) compaction run starts at the beginning and - * end of a zone; it completes when the migrate and free scanner meet. - * A partial (order > 0) compaction can start with the free scanner - * at a random point in the zone, and may have to restart. - */ + /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { - if (cc->order > 0 && !cc->wrapped) { - /* We started partway through; restart at the end. */ - unsigned long free_pfn = start_free_pfn(zone); - zone->compact_cached_free_pfn = free_pfn; - cc->free_pfn = free_pfn; - cc->wrapped = 1; - return COMPACT_CONTINUE; - } - return COMPACT_COMPLETE; - } + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. + */ + if (!current_is_kswapd()) + zone->compact_blockskip_flush = true; - /* We wrapped around and ended up where we started. */ - if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) return COMPACT_COMPLETE; + } /* * order == -1 is expected when compacting via @@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - for (order = cc->order; order < MAX_ORDER; order++) { - /* Job done if page is free of the right migratetype */ - if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (order >= pageblock_order && zone->free_area[order].nr_free) + if (cc->page) { + /* Was a suitable page captured? */ + if (*cc->page) return COMPACT_PARTIAL; + } else { + unsigned int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[cc->order]; + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) + return COMPACT_PARTIAL; + } } return COMPACT_CONTINUE; @@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - /* Setup to move all movable pages to the end of the zone */ - cc->migrate_pfn = zone->zone_start_pfn; - - if (cc->order > 0) { - /* Incremental compaction. Start where the last one stopped. */ - cc->free_pfn = zone->compact_cached_free_pfn; - cc->start_free_pfn = cc->free_pfn; - } else { - /* Order == -1 starts at the end of the zone. */ - cc->free_pfn = start_free_pfn(zone); + /* + * Setup to move all movable pages to the end of the zone. Used cached + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. + */ + cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn = cc->migrate_pfn; } + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { @@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } + + /* Capture a page now if it is a suitable size */ + compact_capture_page(cc); } out: @@ -829,8 +1025,10 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended) + bool sync, bool *contended, + struct page **page) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .contended = contended, + .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - return compact_zone(zone, &cc); + ret = compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; @@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @sync: Whether migration is synchronous or not + * @contended: Return value that is true if compaction was aborted due to lock contention + * @page: Optionally capture a free page of the requested order during compaction * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + bool sync, bool *contended, struct page **page) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; + int alloc_flags = 0; - /* - * Check whether it is worth even starting compaction. The order check is - * made because an assumption is made that the page allocator can satisfy - * the "cheaper" orders without taking special steps - */ + /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended); + contended, page); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, + alloc_flags)) break; } @@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, + .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -950,6 +1159,7 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, + .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/fadvise.c b/mm/fadvise.c index 9b75a04..a47f0f5 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -26,7 +26,7 @@ */ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) { - struct file *file = fget(fd); + struct fd f = fdget(fd); struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) unsigned long nrpages; int ret = 0; - if (!file) + if (!f.file) return -EBADF; - if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { + if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { ret = -ESPIPE; goto out; } - mapping = file->f_mapping; + mapping = f.file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; goto out; @@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&file->f_lock); + f.file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&f.file->f_lock); + f.file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&file->f_lock); - file->f_mode |= FMODE_RANDOM; - spin_unlock(&file->f_lock); + spin_lock(&f.file->f_lock); + f.file->f_mode |= FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&file->f_lock); + f.file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&f.file->f_lock); + f.file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, file, start_index, + force_page_cache_readahead(mapping, f.file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: @@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = -EINVAL; } out: - fput(file); + fdput(f); return ret; } #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS diff --git a/mm/filemap.c b/mm/filemap.c index fa5ca30..83efee7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { - struct blk_plug plug; - - blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); - blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -1611,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); @@ -1741,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ @@ -1753,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); - blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - blk_finish_plug(&plug); sb_end_write(inode->i_sb); return ret; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b..a912da6 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); @@ -193,11 +192,13 @@ retry: if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } @@ -305,6 +306,7 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); diff --git a/mm/fremap.c b/mm/fremap.c index 9ed4fd4..3899a86 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -5,6 +5,7 @@ * * started by Ingo Molnar, Copyright (C) 2002, 2003 */ +#include <linux/export.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/swap.h> @@ -80,9 +81,10 @@ out: return err; } -static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) { + struct mm_struct *mm = vma->vm_mm; int err; do { @@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, pgoff++; } while (size); - return 0; - + return 0; } +EXPORT_SYMBOL(generic_file_remap_pages); /** * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma @@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + if (!vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) @@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, */ if (mapping_cap_account_dirty(mapping)) { unsigned long addr; - struct file *file = vma->vm_file; + struct file *file = get_file(vma->vm_file); flags &= MAP_NONBLOCK; - get_file(file); addr = mmap_region(file, start, size, flags, vma->vm_flags, pgoff); fput(file); @@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); @@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } mmu_notifier_invalidate_range_start(mm, start, start + size); - err = populate_range(mm, vma, start, size, pgoff); + err = vma->vm_ops->remap_pages(vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/frontswap.c b/mm/frontswap.c index 6b3e71a..2890e67d 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled); */ static bool frontswap_writethrough_enabled __read_mostly; +/* + * If enabled, the underlying tmem implementation is capable of doing + * exclusive gets, so frontswap_load, on a successful tmem_get must + * mark the page as no longer in frontswap AND mark it dirty. + */ +static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; + #ifdef CONFIG_DEBUG_FS /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable) EXPORT_SYMBOL(frontswap_writethrough); /* + * Enable/disable frontswap exclusive gets (see above). + */ +void frontswap_tmem_exclusive_gets(bool enable) +{ + frontswap_tmem_exclusive_gets_enabled = enable; +} +EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); + +/* * Called when a swap device is swapon'd. */ void __frontswap_init(unsigned type) @@ -174,8 +190,13 @@ int __frontswap_load(struct page *page) BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) ret = frontswap_ops.load(type, offset, page); - if (ret == 0) + if (ret == 0) { inc_frontswap_loads(); + if (frontswap_tmem_exclusive_gets_enabled) { + SetPageDirty(page); + frontswap_clear(sis, offset); + } + } return ret; } EXPORT_SYMBOL(__frontswap_load); @@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, return ret; } +/* + * Used to check if it's necessory and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shink pages, + * error code when there is an error. + */ static int __frontswap_shrink(unsigned long target_pages, unsigned long *pages_to_unuse, int *type) @@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages, if (total_pages <= target_pages) { /* Nothing to do */ *pages_to_unuse = 0; - return 0; + return 1; } total_pages_to_unuse = total_pages - target_pages; return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); @@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages, void frontswap_shrink(unsigned long target_pages) { unsigned long pages_to_unuse = 0; - int type, ret; + int uninitialized_var(type), ret; /* * we don't want to hold swap_lock while doing a very @@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages) spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); spin_unlock(&swap_lock); - if (ret == 0 && pages_to_unuse) + if (ret == 0) try_to_unuse(type, true, pages_to_unuse); return; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57c4b93..a863af2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) unsigned long recommended_min; extern int min_free_kbytes; - if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) && - !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags)) + if (!khugepaged_enabled()) return 0; for_each_populated_zone(zone) @@ -139,12 +136,6 @@ static int start_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { - int wakeup; - if (unlikely(!mm_slot_cache || !mm_slots_hash)) { - err = -ENOMEM; - goto out; - } - mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -154,16 +145,16 @@ static int start_khugepaged(void) err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } - wakeup = !list_empty(&khugepaged_scan.mm_head); - mutex_unlock(&khugepaged_mutex); - if (wakeup) + + if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); set_recommended_min_free_kbytes(); - } else - /* wakeup to exit */ - wake_up_interruptible(&khugepaged_wait); -out: + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } + return err; } @@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err = start_khugepaged(); + int err; + + mutex_lock(&khugepaged_mutex); + err = start_khugepaged(); + mutex_unlock(&khugepaged_mutex); + if (err) ret = err; } - if (ret > 0 && - (test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) || - test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags))) - set_recommended_min_free_kbytes(); - return ret; } static struct kobj_attribute enabled_attr = @@ -570,8 +559,6 @@ static int __init hugepage_init(void) start_khugepaged(); - set_recommended_min_free_kbytes(); - return 0; out: hugepage_exit_sysfs(hugepage_kobj); @@ -611,19 +598,6 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static void prepare_pmd_huge_pte(pgtable_t pgtable, - struct mm_struct *mm) -{ - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(&pgtable->lru); - else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; -} - static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) @@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - prepare_pmd_huge_pte(pgtable, dst_mm); + pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -802,25 +776,6 @@ out: return ret; } -/* no "address" argument so destroys page coloring of some arch */ -pgtable_t get_pmd_huge_pte(struct mm_struct *mm) -{ - pgtable_t pgtable; - - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - pgtable = mm->pmd_huge_pte; - if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, - struct page, lru); - list_del(&pgtable->lru); - } - return pgtable; -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, cond_resched(); } + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, page_remove_rmap(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + ret |= VM_FAULT_WRITE; put_page(page); @@ -904,6 +867,7 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; struct page *page, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); spin_lock(&mm->page_table_lock); @@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); - goto out; + goto out_mn; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache_pmd(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; } -out_unlock: spin_unlock(&mm->page_table_lock); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; } -struct page *follow_trans_huge_pmd(struct mm_struct *mm, +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; assert_spin_locked(&mm->page_table_lock); @@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) @@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); + pmd_t orig_pmd; + pgtable = pgtable_trans_huge_withdraw(tlb->mm); + orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); + page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); @@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); @@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page, * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; } spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } @@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); - for (i = 0, haddr = address; i < HPAGE_PMD_NR; - i++, haddr += PAGE_SIZE) { + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); entry = mk_pte(page + i, vma->vm_page_prot); @@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page, * SMP TLB and finally we write the non-huge version * of the pmd entry with pmd_populate. */ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; } @@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page, __split_huge_page_refcount(page); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) @@ -1491,12 +1476,13 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ - VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { + struct mm_struct *mm = vma->vm_mm; + switch (advice) { case MADV_HUGEPAGE: /* @@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma, */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; + if (mm->def_flags & VM_NOHUGEPAGE) + return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* @@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1811,7 +1795,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); VM_BUG_ON(page_mapcount(src_page) != 1); - VM_BUG_ON(page_count(src_page) != 2); release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to @@ -1834,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) +static void khugepaged_alloc_sleep(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *ptl; - int isolated; - unsigned long hstart, hend; + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} - VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#ifndef CONFIG_NUMA - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - new_page = *hpage; -#else +#ifdef CONFIG_NUMA +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + *hpage = NULL; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ VM_BUG_ON(*hpage); /* * Allocate the page while the vma is still valid and under @@ -1867,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, node, __GFP_OTHER_NODE); /* @@ -1875,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm, * preparation for taking it in write mode. */ up_read(&mm->mmap_sem); - if (unlikely(!new_page)) { + if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); - return; + return NULL; } -#endif count_vm_event(THP_COLLAPSE_ALLOC); - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { -#ifdef CONFIG_NUMA - put_page(new_page); + return *hpage; +} +#else +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + return *hpage; +} #endif + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* release the mmap_sem read lock. */ + new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + if (!new_page) + return; + + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; - } /* * Prevent all access to pagetables with the exception of @@ -1913,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; if (is_vma_temporary_stack(vma)) goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -1937,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1944,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); + _pmd = pmdp_clear_flush(vma, address, pmd); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); @@ -1971,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - VM_BUG_ON(page_count(pgtable) != 1); - VM_BUG_ON(page_mapcount(pgtable) != 0); _pmd = mk_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); @@ -1989,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, _pmd); - prepare_pmd_huge_pte(pgtable, mm); + update_mmu_cache_pmd(vma, address, pmd); + pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); -#ifndef CONFIG_NUMA *hpage = NULL; -#endif + khugepaged_pages_collapsed++; out_up_write: up_write(&mm->mmap_sem); @@ -2003,9 +2055,6 @@ out_up_write: out: mem_cgroup_uncharge_page(new_page); -#ifdef CONFIG_NUMA - put_page(new_page); -#endif goto out_up_write; } @@ -2155,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, goto skip; if (is_vma_temporary_stack(vma)) goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; @@ -2235,32 +2279,23 @@ static int khugepaged_has_work(void) static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_do_scan(void) { + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { - cond_resched(); - -#ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - break; - } - count_vm_event(THP_COLLAPSE_ALLOC); - } -#else - if (IS_ERR(*hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#endif + + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break; @@ -2271,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - hpage); + &hpage); else progress = pages; spin_unlock(&khugepaged_mm_lock); } -} -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); } -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(void) +static void khugepaged_wait_work(void) { - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && - likely(khugepaged_enabled())); - return hpage; -} -#endif + try_to_freeze(); -static void khugepaged_loop(void) -{ - struct page *hpage; + if (khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; -#ifdef CONFIG_NUMA - hpage = NULL; -#endif - while (likely(khugepaged_enabled())) { -#ifndef CONFIG_NUMA - hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) - break; -#else - if (IS_ERR(hpage)) { - khugepaged_alloc_sleep(); - hpage = NULL; - } -#endif - - khugepaged_do_scan(&hpage); -#ifndef CONFIG_NUMA - if (hpage) - put_page(hpage); -#endif - try_to_freeze(); - if (unlikely(kthread_should_stop())) - break; - if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) - continue; - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); - } else if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, - khugepaged_wait_event()); + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) @@ -2347,20 +2341,9 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - /* serialize with start_khugepaged() */ - mutex_lock(&khugepaged_mutex); - - for (;;) { - mutex_unlock(&khugepaged_mutex); - VM_BUG_ON(khugepaged_thread != current); - khugepaged_loop(); - VM_BUG_ON(khugepaged_thread != current); - - mutex_lock(&khugepaged_mutex); - if (!khugepaged_enabled()) - break; - if (unlikely(kthread_should_stop())) - break; + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); @@ -2369,10 +2352,6 @@ static int khugepaged(void *none) if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); - - khugepaged_thread = NULL; - mutex_unlock(&khugepaged_mutex); - return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc72712..59a0059 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,7 +30,6 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> -#include <linux/hugetlb_cgroup.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -637,6 +636,7 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); @@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) } } +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ int PageHuge(struct page *page) { compound_page_dtor *dtor; @@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); tlb_start_vma(tlb, vma); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { @@ -2425,7 +2432,7 @@ again: if (address < end && !ref_page) goto again; } - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); } @@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = vma_hugecache_offset(h, vma, address); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* @@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; @@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_page = pte_page(pte); @@ -2611,6 +2620,9 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * Retake the page_table_lock to check for racing updates * before the page tables are altered @@ -2619,9 +2631,6 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ - mmu_notifier_invalidate_range_start(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); @@ -2629,10 +2638,11 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; - mmu_notifier_invalidate_range_end(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); } + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); return 0; diff --git a/mm/internal.h b/mm/internal.h index b8c91b3..a4fa284 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -118,26 +118,27 @@ struct compact_control { unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long start_free_pfn; /* where we started the search */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - bool wrapped; /* Order > 0 compactions are - incremental, once free_pfn - and migrate_pfn meet, we restart - from the top of the zone; - remember we wrapped around. */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool finished_update_free; /* True when the zone cached pfns are + * no longer being updated + */ + bool finished_update_migrate; int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool *contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended */ + struct page **page; /* Page captured of requested size */ }; unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn); + unsigned long low_pfn, unsigned long end_pfn, bool unevictable); #endif @@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path via page_evictable() for a new page - * to determine if it's being mapped into a LOCKED vma. - * If so, mark page as mlocked. + * Called only in fault path, to determine if a new page is being + * mapped into a LOCKED vma. If it is, mark page as mlocked. */ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, struct page *page) @@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, return 0; if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } return 1; @@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page); * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ -extern void __clear_page_mlock(struct page *page); -static inline void clear_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) - __clear_page_mlock(page); -} +extern void clear_page_mlock(struct page *page); /* * mlock_migrate_page - called only from migrate_page_copy() to @@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 -#endif extern int hwpoison_filter(struct page *p); @@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ + +#endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 0000000..4a5822a --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,112 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse <walken@google.com> + * + * This file is released under the GPL v2. + */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/rmap.h> +#include <linux/interval_tree_generic.h> + +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, + unsigned long, shared.linear.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last = vma_last_pgoff(node); + + VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); + + if (!prev->shared.linear.rb.rb_right) { + parent = prev; + link = &prev->shared.linear.rb.rb_right; + } else { + parent = rb_entry(prev->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + node->shared.linear.rb_subtree_last = last; + rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&node->shared.linear.rb, root, + &vma_interval_tree_augment); +} + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 45eb621..a217cc5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -29,7 +29,7 @@ * - kmemleak_lock (rwlock): protects the object_list modifications and * accesses to the object_tree_root. The object_list is the main list * holding the metadata (struct kmemleak_object) for the allocated memory - * blocks. The object_tree_root is a priority search tree used to look-up + * blocks. The object_tree_root is a red black tree used to look-up * metadata based on a pointer to the corresponding memory block. The * kmemleak_object structures are added to the object_list and * object_tree_root in the create_object() function called from the @@ -71,7 +71,7 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/kthread.h> -#include <linux/prio_tree.h> +#include <linux/rbtree.h> #include <linux/fs.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -132,7 +132,7 @@ struct kmemleak_scan_area { * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the * object->lock. Insertions or deletions from object_list, gray_list or - * tree_node are already protected by the corresponding locks or mutex (see + * rb_node are already protected by the corresponding locks or mutex (see * the notes on locking above). These objects are reference-counted * (use_count) and freed using the RCU mechanism. */ @@ -141,7 +141,7 @@ struct kmemleak_object { unsigned long flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; - struct prio_tree_node tree_node; + struct rb_node rb_node; struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; @@ -182,9 +182,9 @@ struct kmemleak_object { static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); -/* prio search tree for object boundaries */ -static struct prio_tree_root object_tree_root; -/* rw_lock protecting the access to object_list and prio_tree_root */ +/* search tree for object boundaries */ +static struct rb_root object_tree_root = RB_ROOT; +/* rw_lock protecting the access to object_list and object_tree_root */ static DEFINE_RWLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ @@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.entries = object->trace; pr_notice("Object 0x%08lx (size %zu):\n", - object->tree_node.start, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); @@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) } /* - * Look-up a memory block metadata (kmemleak_object) in the priority search + * Look-up a memory block metadata (kmemleak_object) in the object search * tree based on a pointer value. If alias is 0, only values pointing to the * beginning of the memory block are allowed. The kmemleak_lock must be held * when calling this function. */ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { - struct prio_tree_node *node; - struct prio_tree_iter iter; - struct kmemleak_object *object; - - prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); - node = prio_tree_next(&iter); - if (node) { - object = prio_tree_entry(node, struct kmemleak_object, - tree_node); - if (!alias && object->pointer != ptr) { + struct rb_node *rb = object_tree_root.rb_node; + + while (rb) { + struct kmemleak_object *object = + rb_entry(rb, struct kmemleak_object, rb_node); + if (ptr < object->pointer) + rb = object->rb_node.rb_left; + else if (object->pointer + object->size <= ptr) + rb = object->rb_node.rb_right; + else if (object->pointer == ptr || alias) + return object; + else { kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); - object = NULL; + break; } - } else - object = NULL; - - return object; + } + return NULL; } /* @@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) } /* - * Look up an object in the prio search tree and increase its use_count. + * Look up an object in the object search tree and increase its use_count. */ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { @@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { unsigned long flags; - struct kmemleak_object *object; - struct prio_tree_node *node; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_len = __save_stack_trace(object->trace); - INIT_PRIO_TREE_NODE(&object->tree_node); - object->tree_node.start = ptr; - object->tree_node.last = ptr + size - 1; - write_lock_irqsave(&kmemleak_lock, flags); min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); - node = prio_tree_insert(&object_tree_root, &object->tree_node); - /* - * The code calling the kernel does not yet have the pointer to the - * memory block to be able to free it. However, we still hold the - * kmemleak_lock here in case parts of the kernel started freeing - * random memory blocks. - */ - if (node != &object->tree_node) { - kmemleak_stop("Cannot insert 0x%lx into the object search tree " - "(already existing)\n", ptr); - object = lookup_object(ptr, 1); - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); - - goto out; + link = &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + if (ptr + size <= parent->pointer) + link = &parent->rb_node.rb_left; + else if (parent->pointer + parent->size <= ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object " + "search tree (overlaps existing)\n", + ptr); + kmem_cache_free(object_cache, object); + object = parent; + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + goto out; + } } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, &object_tree_root); + list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); @@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) unsigned long flags; write_lock_irqsave(&kmemleak_lock, flags); - prio_tree_remove(&object_tree_root, &object->tree_node); + rb_erase(&object->rb_node, &object_tree_root); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); @@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct kmemleak_object *prev_obj = v; struct kmemleak_object *next_obj = NULL; - struct list_head *n = &prev_obj->object_list; + struct kmemleak_object *obj = prev_obj; ++(*pos); - list_for_each_continue_rcu(n, &object_list) { - struct kmemleak_object *obj = - list_entry(n, struct kmemleak_object, object_list); + list_for_each_entry_continue_rcu(obj, &object_list, object_list) { if (get_object(obj)) { next_obj = obj; break; @@ -1768,7 +1769,6 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - INIT_PRIO_TREE_ROOT(&object_tree_root); if (crt_early_log >= ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " @@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; int swapped; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto out_mn; if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; @@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(*pmd)) goto out; + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); - goto out; + goto out_mn; } get_page(kpage); @@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) @@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page, SetPageSwapBacked(new_page); __set_page_locked(new_page); - if (page_evictable(new_page, vma)) + if (!mlocked_vma_newpage(vma, new_page)) lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(new_page); @@ -1614,7 +1635,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1667,7 +1689,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1719,7 +1742,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/madvise.c b/mm/madvise.c index 14d260f..03dfa5c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags &= ~VM_DONTCOPY; break; case MADV_DONTDUMP: - new_flags |= VM_NODUMP; + new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - new_flags &= ~VM_NODUMP; + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/memblock.c b/mm/memblock.c index 4d9393c..931eef1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ -static inline const char *memblock_type_name(struct memblock_type *type) +static __init_memblock const char * +memblock_type_name(struct memblock_type *type) { if (type == &memblock.memory) return "memory"; @@ -246,7 +247,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); - new_array = addr ? __va(addr) : 0; + new_array = addr ? __va(addr) : NULL; } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", @@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return ret; for (i = start_rgn; i < end_rgn; i++) - type->regions[i].nid = nid; + memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 795e525..7acf43b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,6 +51,7 @@ #include <linux/oom.h> #include "internal.h" #include <net/sock.h> +#include <net/ip.h> #include <net/tcp_memcontrol.h> #include <asm/uaccess.h> @@ -326,7 +327,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; -#ifdef CONFIG_INET +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif }; @@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + /* Writing them here to avoid exposing memcg's inner layout */ -#ifdef CONFIG_MEMCG_KMEM -#include <net/sock.h> -#include <net/ip.h> +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) -static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) { if (mem_cgroup_sockets_enabled) { @@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk) } } -#ifdef CONFIG_INET struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) { if (!memcg || mem_cgroup_is_root(memcg)) @@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) return &memcg->tcp_mem.cg_proto; } EXPORT_SYMBOL(tcp_proto_cgroup); -#endif /* CONFIG_INET */ -#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static void disarm_sock_keys(struct mem_cgroup *memcg) { if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) @@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; @@ -4973,6 +4967,13 @@ mem_cgroup_create(struct cgroup *cont) } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + /* + * Deeper hierachy with use_hierarchy == false doesn't make + * much sense so let cgroup subsystem know about this + * unfortunate state in our controller. + */ + if (parent && parent != root_mem_cgroup) + mem_cgroup_subsys.broken_hierarchy = true; } memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141..6c5899b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + pgoff_t pgoff; av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; if (!task_early_kill(tsk)) continue; - list_for_each_entry(vmac, &av->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; @@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index 5736170..fb135ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } @@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool is_cow; int ret; /* @@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | + VM_PFNMAP | VM_MIXEDMAP))) { if (!vma->anon_vma) return 0; } @@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (unlikely(is_pfn_mapping(vma))) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_vma_copy(vma); + ret = track_pfn_copy(vma); if (ret) return ret; } @@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + is_cow = is_cow_mapping(vma->vm_flags); + mmun_start = addr; + mmun_end = end; + if (is_cow) + mmu_notifier_invalidate_range_start(src_mm, mmun_start, + mmun_end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_end(src_mm, - vma->vm_start, end); + if (is_cow) + mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); return ret; } @@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); } else { - page = follow_trans_huge_pmd(mm, address, + page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); goto out; @@ -1576,12 +1583,12 @@ split_fallthrough: if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* - * Because we lock page here and migration is - * blocked by the pte's page reference, we need - * only check for file-cache page truncation. + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. */ - if (page->mapping) - mlock_vma_page(page); + mlock_vma_page(page); unlock_page(page); } } @@ -2085,6 +2092,11 @@ out: * ask for a shared writable mapping! * * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, return -EFAULT; if (!page_count(page)) return -EINVAL; - vma->vm_flags |= VM_INSERTPAGE; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); @@ -2132,7 +2148,7 @@ out: * @addr: target user address of this page * @pfn: source kernel pfn * - * Similar to vm_inert_page, this allows drivers to insert individual pages + * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and @@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) + if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; ret = insert_pfn(vma, addr, pfn, pgprot); - if (ret) - untrack_pfn_vma(vma, pfn, PAGE_SIZE); - return ret; } EXPORT_SYMBOL(vm_insert_pfn); @@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. */ - if (addr == vma->vm_start && end == vma->vm_end) { + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFN_AT_MMAP; - } else if (is_cow_mapping(vma->vm_flags)) - return -EINVAL; - - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + } - err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); - if (err) { - /* - * To indicate that track_pfn related cleanup is not - * needed from higher level routine calling unmap_vmas - */ - vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFN_AT_MMAP; + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) return -EINVAL; - } + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } @@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; pte_t entry; int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool mmun_called = false; /* For mmu_notifiers */ old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2698,6 +2707,11 @@ gotten: if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + mmun_start = address & PAGE_MASK; + mmun_end = (address & PAGE_MASK) + PAGE_SIZE; + mmun_called = true; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Re-check the pte - we dropped the lock */ @@ -2764,6 +2778,8 @@ gotten: page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); + if (mmun_called) + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. */ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3ad25f9..56b758a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; + struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); + + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; } } @@ -126,9 +133,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -187,9 +191,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) end_pfn = pfn + pgdat->node_spanned_pages; /* register_section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) - register_page_bootmem_info_section(pfn); - + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other node. + */ + if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -358,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); + release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - release_mem_region(pfn << PAGE_SHIFT, - PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; @@ -752,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) return 0; } -static struct page * -hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) -{ - /* This should be improooooved!! */ - return alloc_page(GFP_HIGHUSER_MOVABLE); -} - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -809,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_lru_pages(&source); goto out; } - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + + /* + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. + */ + ret = migrate_pages(&source, alloc_migrate_target, 0, true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); @@ -866,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -static int __ref offline_pages(unsigned long start_pfn, +static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -966,8 +974,13 @@ repeat: init_per_zone_wmark_min(); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { zone_pcp_reset(zone); + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } else + zone_pcp_update(zone); if (!node_present_pages(node)) { node_clear_state(node, N_HIGH_MEMORY); @@ -994,15 +1007,55 @@ out: return ret; } +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); +} + int remove_memory(u64 start, u64 size) { + struct memory_block *mem = NULL; + struct mem_section *section; unsigned long start_pfn, end_pfn; + unsigned long pfn, section_nr; + int ret; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return offline_pages(start_pfn, end_pfn, 120 * HZ); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + section_nr = pfn_to_section_nr(pfn); + if (!present_section_nr(section_nr)) + continue; + + section = __nr_to_section(section_nr); + /* same memblock? */ + if (mem) + if ((section_nr >= mem->start_section_nr) && + (section_nr <= mem->end_section_nr)) + continue; + + mem = find_memory_block_hinted(section, mem); + if (!mem) + continue; + + ret = offline_memory_block(mem); + if (ret) { + kobject_put(&mem->dev.kobj); + return ret; + } + } + + if (mem) + kobject_put(&mem->dev.kobj); + + return 0; } #else +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return -EINVAL; +} int remove_memory(u64 start, u64 size) { return -EINVAL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bd92431..0b78fb9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) +{ + int err; + struct mempolicy *old; + struct mempolicy *new; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { + err = vma->vm_ops->set_policy(vma, new); + if (err) + goto err_out; + } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); + return err; +} + /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) @@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - - /* - * Apply policy to a single VMA. The reference counting of - * policy for vma_policy linkages has already been handled by - * vma_merge and split_vma as necessary. If this is a shared - * policy then ->set_policy will increment the reference count - * for an sp node. - */ - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) { - err = vma->vm_ops->set_policy(vma, new_pol); - if (err) - goto out; - } + err = vma_replace_policy(vma, new_pol); + if (err) + goto out; } out: @@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; - struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (IS_ERR(vma)) - return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, @@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - } else if (vma->vm_policy) + } else if (vma->vm_policy) { pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } } if (!pol) pol = &default_policy; @@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* Caller holds sp->mutex */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + mutex_unlock(&sp->mutex); return pol; } +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_free(n); } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { - struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + struct sp_node *n; + struct mempolicy *newpol; + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + n->start = start; n->end = end; - mpol_get(pol); - pol->flags |= MPOL_F_SHARED; /* for unref */ - n->policy = pol; + n->policy = newpol; + return n; } @@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2 = NULL; + struct sp_node *n; + int ret = 0; -restart: - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2178,16 +2227,14 @@ restart: } else { /* Old policy spanning whole new range. */ if (n->end > end) { + struct sp_node *new2; + new2 = sp_alloc(end, n->end, n->policy); if (!new2) { - spin_unlock(&sp->lock); - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) - return -ENOMEM; - goto restart; + ret = -ENOMEM; + goto out; } n->end = start; sp_insert(sp, new2); - new2 = NULL; break; } else n->end = start; @@ -2198,12 +2245,9 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); - if (new2) { - mpol_put(new2->policy); - kmem_cache_free(sn_cache, new2); - } - return 0; +out: + mutex_unlock(&sp->mutex); + return ret; } /** @@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + mutex_init(&sp->mutex); if (mpol) { struct vm_area_struct pvma; @@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info, } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) - kmem_cache_free(sn_cache, new); + sp_free(new); return err; } @@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + mutex_lock(&p->mutex); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - rb_erase(&n->nd, &p->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_delete(p, n); } - spin_unlock(&p->lock); + mutex_unlock(&p->mutex); } /* assumes fs == KERNEL_DS */ @@ -2562,7 +2604,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) break; default: - BUG(); + return -EINVAL; } l = strlen(policy_modes[mode]); @@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock); /* * LRU accounting for clear_page_mlock() */ -void __clear_page_mlock(struct page *page) +void clear_page_mlock(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - - if (!page->mapping) { /* truncated ? */ + if (!TestClearPageMlocked(page)) return; - } - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGCLEARED); if (!isolate_lru_page(page)) { putback_lru_page(page); @@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); if (!isolate_lru_page(page)) putback_lru_page(page); @@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); if (!isolate_lru_page(page)) { int ret = SWAP_AGAIN; @@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_IO | VM_PFNMAP)) goto no_mlock; - if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + if (!((vma->vm_flags & VM_DONTEXPAND) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) { @@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { lock_page(page); - /* - * Like in __mlock_vma_pages_range(), - * because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - munlock_vma_page(page); + munlock_vma_page(page); unlock_page(page); put_page(page); } @@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: @@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; @@ -306,7 +297,7 @@ out: return retval; } -#ifdef DEBUG_MM_RB +#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { int i = 0, j; @@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; + struct vm_area_struct *vma = mm->mmap; + while (vma) { + struct anon_vma_chain *avc; + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma = vma->vm_next; i++; } if (i != mm->map_count) @@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +static int find_vma_links(struct mm_struct *mm, unsigned long addr, + unsigned long end, struct vm_area_struct **pprev, + struct rb_node ***rb_link, struct rb_node **rb_parent) +{ + struct rb_node **__rb_link, *__rb_parent, *rb_prev; __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; - vma = NULL; while (*__rb_link) { struct vm_area_struct *vma_tmp; @@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; + /* Fail if an existing vma overlaps the area */ + if (vma_tmp->vm_start < end) + return -ENOMEM; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; @@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); *rb_link = __rb_link; *rb_parent = __rb_parent; - return vma; + return 0; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *__vma, *prev; + struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - BUG_ON(__vma && __vma->vm_start < vma->vm_end); + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + BUG(); __vma_link(mm, vma, prev, rb_link, rb_parent); mm->map_count++; } @@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. @@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end); vma_adjust_trans_huge(vma, start, end, adjust_next); - /* - * When changing only vma->vm_end, we don't really need anon_vma - * lock. This is a fairly rare case by itself, but the anon_vma - * lock may be shared between many sibling processes. Skipping - * the lock for brk adjustments makes a difference sometimes. - */ - if (vma->anon_vma && (importer || start != vma->vm_start)) { - anon_vma = vma->anon_vma; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { + VM_BUG_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); anon_vma_lock(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); } if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } @@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock(anon_vma); + } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); - if (next->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); } if (next->anon_vma) anon_vma_merge(vma, next); @@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ - if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) + if (vma->vm_flags ^ vm_flags) return 0; if (vma->vm_file != file) return 0; @@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ @@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) return 0; /* Specialty mapping? */ - if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + if (vm_flags & VM_PFNMAP) return 0; /* Can the mapping track the dirty pages? */ @@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ error = -ENOMEM; munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -1301,13 +1324,10 @@ munmap_back: goto free_vma; correct_wcount = 1; } - vma->vm_file = file; - get_file(file); + vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if (vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); /* Can addr have changed?? * @@ -1356,9 +1376,8 @@ out: } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); - if (file && uprobe_mmap(vma)) - /* matching probes but cannot insert */ - goto unmap_and_free_vma; + if (file) + uprobe_mmap(vma); return addr; @@ -1759,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1809,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } @@ -1990,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (anon_vma_clone(new, vma)) goto out_free_mpol; - if (new->vm_file) { + if (new->vm_file) get_file(new->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); @@ -2012,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); - if (new->vm_file) { - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); + if (new->vm_file) fput(new->vm_file); - } unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); @@ -2201,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) * Clear old maps. this also does some error checking for us */ munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2316,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm) * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_mutex is taken here. */ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *prev; + struct rb_node **rb_link, *rb_parent; /* * The vm_pgoff of a purely anonymous vma should be irrelevant @@ -2337,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) @@ -2353,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; @@ -2372,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); if (new_vma) { @@ -2394,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * linear if there are no pages mapped yet. */ VM_BUG_ON(faulted_in_anon_vma); - *vmap = new_vma; - } else - anon_vma_moveto_tail(new_vma); + *vmap = vma = new_vma; + } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) goto out_free_vma; + vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; - vma_set_policy(new_vma, pol); - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) { + if (new_vma->vm_file) get_file(new_vma->vm_file); - - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } } return new_vma; @@ -2537,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -2553,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); } } @@ -2594,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * @@ -2641,13 +2658,13 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->head will + * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next @@ -2655,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); anon_vma_unlock(anon_vma); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b608..479a1e7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -14,10 +14,14 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/slab.h> +/* global SRCU for all MMs */ +static struct srcu_struct srcu; + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -25,8 +29,8 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ @@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; struct hlist_node *n; + int id; /* - * RCU here will block mmu_notifier_unregister until + * SRCU here will block mmu_notifier_unregister until * ->release returns. */ - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) /* * if ->release runs before mmu_notifier_unregister it @@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to + * synchronize_srcu here prevents mmu_notifier_release to * return to exit_mmap (which would proceed freeing all pages * in the mm) until the ->release method returns, if it was * invoked by mmu_notifier_unregister. @@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * The mmu_notifier_mm can't go away from under us because one * mm_count is hold by exit_mmap. */ - synchronize_rcu(); + synchronize_srcu(&srcu); } /* @@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); @@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, break; } } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); - /* - * Some drivers don't have change_pte, - * so we must call invalidate_page in that case. - */ - else if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); - ret = -ENOMEM; - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) - goto out; + /* + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ + BUG_ON(!srcu.per_cpu_ref); if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out_cleanup; + goto out; if (!mm_has_notifiers(mm)) { + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), + GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) { + ret = -ENOMEM; + goto out_of_mem; + } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; - mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); +out_of_mem: mm_drop_all_locks(mm); -out_cleanup: +out: if (take_mmap_sem) up_write(&mm->mmap_sem); - /* kfree() does nothing if mmu_notifier_mm is NULL */ - kfree(mmu_notifier_mm); -out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } @@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (!hlist_unhashed(&mn->hlist)) { /* - * RCU here will force exit_mmap to wait ->release to finish + * SRCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ - rcu_read_lock(); + int id; + id = srcu_read_lock(&srcu); /* * exit_mmap will block in mmu_notifier_release to * guarantee ->release is called before freeing the @@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); hlist_del_rcu(&mn->hlist); @@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * Wait any running method to finish, of course including * ->release if it was run by mmu_notifier_relase instead of us. */ - synchronize_rcu(); + synchronize_srcu(&srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} + +module_init(mmu_notifier_init); diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e..1b61c2d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) + unsigned long new_addr, bool need_rmap_locks) { struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - if (vma->vm_file) { - /* - * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock truncate_pagecache - * out, since it might clean the dst vma before the src vma, - * and we propagate stale pages into the dst afterward. - */ - mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + /* + * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). + */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } } /* @@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len) + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + new_vma, new_pmd, new_addr, need_rmap_locks); need_flush = true; } if (likely(need_flush)) flush_tlb_range(vma, old_end-len, old_addr); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); return len + old_addr - old_end; /* how much done */ } @@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int split = 0; int err; + bool need_rmap_locks; /* * We'd prefer to avoid failure later on in do_munmap: @@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); if (!new_vma) return -ENOMEM; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); if (moved_len < old_len) { /* - * Before moving the page tables from the new vma to - * the old vma, we need to be sure the old vma is - * queued after new vma in the same_anon_vma list to - * prevent SMP races with rmap_walk (that could lead - * rmap_walk to miss some page table). - */ - anon_vma_moveto_tail(vma); - - /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); vma = new_vma; old_len = new_len; old_addr = new_addr; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 4055730..714d5d6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); + fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), + start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; + reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); @@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void) * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related */ return free_low_memory_core_early(MAX_NUMNODES); } @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } put_nommu_region(vma->vm_region); kmem_cache_free(vm_area_cachep, vma); } @@ -1282,14 +1279,8 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_pgoff = pgoff; if (file) { - region->vm_file = file; - get_file(file); - vma->vm_file = file; - get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } + region->vm_file = get_file(file); + vma->vm_file = get_file(file); } down_write(&nommu_region_sem); @@ -1442,8 +1433,6 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); kleave(" = %d", ret); return ret; @@ -1822,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); @@ -1963,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(generic_file_remap_pages); + static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { @@ -2047,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2059,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2076,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1986008..79e0f3e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d, oom_score_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj, + "oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c66fb87..bb90971 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); set_page_private(page, 0); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, + migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -584,7 +585,7 @@ static inline void __free_one_page(struct page *page, combined_idx = buddy_idx & page_idx; higher_page = page + (combined_idx - page_idx); buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = page + (buddy_idx - combined_idx); + higher_buddy = higher_page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -597,17 +598,6 @@ out: zone->free_area[order].nr_free++; } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | @@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { + int mt; /* migratetype of the to-be-freed page */ + page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, page_private(page)); - trace_mm_page_pcpu_drain(page, 0, page_private(page)); + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); @@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + if (unlikely(migratetype != MIGRATE_ISOLATE)) + __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order) static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int wasMlocked = __TestClearPageMlocked(page); + int migratetype; if (!free_pages_prepare(page, order)) return; local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_freepage_migratetype(page, migratetype); + free_one_page(page_zone(page), page, order, migratetype); local_irq_restore(flags); } @@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page, set_page_guard_flag(&page[size]); set_page_private(&page[size], high); /* Guard pages are not available for any usage */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + __mod_zone_freepage_state(zone, -(1 << high), + migratetype); continue; } #endif @@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -static int move_freepages(struct zone *zone, +int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, int migratetype) { @@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); + set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) mt = migratetype; } - set_page_private(page, mt); + set_freepage_migratetype(page, mt); list = &page->lru; + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); @@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; int migratetype; - int wasMlocked = __TestClearPageMlocked(page); if (!free_pages_prepare(page, 0)) return; migratetype = get_pageblock_migratetype(page); - set_page_private(page, migratetype); + set_freepage_migratetype(page, migratetype); local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_event(PGFREE); /* @@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. + * Similar to the split_page family of functions except that the page + * required at the given order and being isolated now to prevent races + * with parallel allocators */ -int split_free_page(struct page *page) +int capture_free_page(struct page *page, int alloc_order, int migratetype) { unsigned int order; unsigned long watermark; struct zone *zone; + int mt; BUG_ON(!PageBuddy(page)); @@ -1409,12 +1402,16 @@ int split_free_page(struct page *page) list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); + mt = get_pageblock_migratetype(page); + if (unlikely(mt != MIGRATE_ISOLATE)) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + if (alloc_order != order) + expand(zone, page, alloc_order, order, + &zone->free_area[order], migratetype); + + /* Set the pageblock if the captured page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1425,7 +1422,35 @@ int split_free_page(struct page *page) } } - return 1 << order; + return 1UL << order; +} + +/* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + int nr_pages; + + BUG_ON(!PageBuddy(page)); + order = page_order(page); + + nr_pages = capture_free_page(page, order, 0); + if (!nr_pages) + return 0; + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + return nr_pages; } /* @@ -1484,7 +1509,8 @@ again: spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); + __mod_zone_freepage_state(zone, -(1 << order), + get_pageblock_migratetype(page)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); @@ -1501,19 +1527,6 @@ failed: return NULL; } -/* The ALLOC_WMARK bits are used as an index to zone->watermark */ -#define ALLOC_WMARK_MIN WMARK_MIN -#define ALLOC_WMARK_LOW WMARK_LOW -#define ALLOC_WMARK_HIGH WMARK_HIGH -#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ - -/* Mask to get the watermark bits */ -#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) - -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ - #ifdef CONFIG_FAIL_PAGE_ALLOC static struct { @@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; - +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); +#endif if (free_pages <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { @@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); +} + +static void __paginginit init_zone_allows_reclaim(int nid) +{ + int i; + + for_each_online_node(i) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + node_set(i, NODE_DATA(nid)->reclaim_nodes); + zone_reclaim_mode = 1; + } +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static void zlc_clear_zones_full(struct zonelist *zonelist) { } + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static inline void init_zone_allows_reclaim(int nid) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1886,7 +1928,8 @@ zonelist_scan: did_zlc_setup = 1; } - if (zone_reclaim_mode == 0) + if (zone_reclaim_mode == 0 || + !zone_allows_reclaim(preferred_zone, zone)) goto this_zone_full; /* @@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page; + struct page *page = NULL; if (!order) return NULL; @@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction); + contended_compaction, &page); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { + /* If compaction captured a page, prep and use it */ + if (page) { + prep_new_page(page, order, gfp_mask); + goto got_page; + } + + if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { +got_page: + preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) @@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } - +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif return alloc_flags; } @@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2441,7 +2494,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ @@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; gfp_mask &= gfp_allowed_mask; @@ -2569,9 +2623,13 @@ retry_cpuset: if (!preferred_zone) goto out; +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, @@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter) " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE)); + global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { int i; @@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone->all_unreclaimable ? "yes" : "no") @@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) j = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - int distance = node_distance(local_node, node); - - /* - * If another node is sufficiently far away then it is better - * to reclaim pages in a zone before going off node. - */ - if (distance > RECLAIM_DISTANCE) - zone_reclaim_mode = 1; - /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (distance != node_distance(local_node, prev_node)) + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) node_load[node] = load; prev_node = node; @@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - zone->compact_cached_free_pfn = zone->zone_start_pfn + - zone->spanned_pages; - zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); -#endif #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) @@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + init_zone_allows_reclaim(nid); calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); @@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early_node_map[] */ + /* Print out the early node map */ printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, @@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } -static struct page * -__alloc_contig_migrate_alloc(struct page *page, unsigned long private, - int **resultp) -{ - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); -} - /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - + unsigned long nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(start)), - .sync = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - migrate_prep_local(); - while (pfn < end || !list_empty(&cc.migratepages)) { + while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } - if (list_empty(&cc.migratepages)) { - cc.nr_migratepages = 0; - pfn = isolate_migratepages_range(cc.zone, &cc, - pfn, end); + if (list_empty(&cc->migratepages)) { + cc->nr_migratepages = 0; + pfn = isolate_migratepages_range(cc->zone, cc, + pfn, end, true); if (!pfn) { ret = -EINTR; break; @@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } - ret = migrate_pages(&cc.migratepages, - __alloc_contig_migrate_alloc, + nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, + &cc->migratepages); + cc->nr_migratepages -= nr_reclaimed; + + ret = migrate_pages(&cc->migratepages, + alloc_migrate_target, 0, false, MIGRATE_SYNC); } - putback_lru_pages(&cc.migratepages); + putback_lru_pages(&cc->migratepages); return ret > 0 ? 0 : ret; } @@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned long outer_start, outer_end; int ret = 0, order; + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + .ignore_skip_hint = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -5783,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) goto done; - ret = __alloc_contig_migrate_range(start, end); + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) goto done; @@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); /* Grab isolated pages from freelists. */ - outer_end = isolate_freepages_range(outer_start, end); + outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { ret = -EBUSY; goto done; @@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data) local_irq_save(flags); if (pcp->count > 0) free_pcppages_bulk(zone, pcp->count, pcp); + drain_zonestat(zone, pset); setup_pageset(pset, batch); local_irq_restore(flags); } @@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone) void zone_pcp_reset(struct zone *zone) { unsigned long flags; + int cpu; + struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ local_irq_save(flags); if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); + drain_zonestat(zone, pset); + } free_percpu(zone->pageset); zone->pageset = &boot_pageset; } @@ -6047,3 +6098,37 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } + +/* reset zone->present_pages */ +void reset_zone_present_pages(void) +{ + struct zone *z; + int i, nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + z->present_pages = 0; + } + } +} + +/* calculate zone's present pages in buddy system */ +void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct zone *z; + unsigned long zone_start_pfn, zone_end_pfn; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + zone_start_pfn = z->zone_start_pfn; + zone_end_pfn = zone_start_pfn + z->spanned_pages; + + /* if the two regions intersect */ + if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) + z->present_pages += min(end_pfn, zone_end_pfn) - + max(start_pfn, zone_start_pfn); + } +} diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f1..f2f5b48 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { + unsigned long nr_pages; + int migratetype = get_pageblock_migratetype(page); + set_pageblock_isolate(page); - move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + + __mod_zone_freepage_state(zone, -nr_pages, migratetype); } spin_unlock_irqrestore(&zone->lock, flags); @@ -89,12 +94,14 @@ out: void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; - unsigned long flags; + unsigned long flags, nr_pages; + zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); @@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) + if (PageBuddy(page)) { + /* + * If race between isolatation and allocation happens, + * some free pages could be in MIGRATE_MOVABLE list + * although pageblock's migratation type of the page + * is MIGRATE_ISOLATE. Catch it and move the page into + * MIGRATE_ISOLATE list. + */ + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { + struct page *end_page; + + end_page = page + (1 << page_order(page)) - 1; + move_freepages(page_zone(page), page, end_page, + MIGRATE_ISOLATE); + } pfn += 1 << page_order(page); + } else if (page_count(page) == 0 && - page_private(page) == MIGRATE_ISOLATE) + get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; else break; @@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } + +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} diff --git a/mm/percpu.c b/mm/percpu.c index bb4be74..ddc5efb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #ifdef CONFIG_SMP -const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { +const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0dda..e642627 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd..0000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include <linux/mm.h> -#include <linux/prio_tree.h> -#include <linux/prefetch.h> - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. - */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. - */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. - */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/readahead.c b/mm/readahead.c index ea8f8fa..7963f23 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp, SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; - struct file *file; + struct fd f; ret = -EBADF; - file = fget(fd); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; + f = fdget(fd); + if (f.file) { + if (f.file->f_mode & FMODE_READ) { + struct address_space *mapping = f.file->f_mapping; pgoff_t start = offset >> PAGE_CACHE_SHIFT; pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); + ret = do_readahead(mapping, f.file, start, len); } - fput(file); + fdput(f); } return ret; } @@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } /* - * Some rmap walk that needs to find all ptes/hugepmds without false - * negatives (like migrate and split_huge_page) running concurrent - * with operations that copy or move pagetables (like mremap() and - * fork()) to be safe. They depend on the anon_vma "same_anon_vma" - * list to be in a certain order: the dst_vma must be placed after the - * src_vma in the list. This is always guaranteed by fork() but - * mremap() needs to call this function to enforce it in case the - * dst_vma isn't newly allocated and chained with the anon_vma_clone() - * function but just an extension of a pre-existing vma through - * vma_merge. - * - * NOTE: the same_anon_vma list can still be changed by other - * processes while mremap runs because mremap doesn't hold the - * anon_vma mutex to prevent modifications to the list while it - * runs. All we need to enforce is that the relative order of this - * process vmas isn't changing (we don't care about other vmas - * order). Each vma corresponds to an anon_vma_chain structure so - * there's no risk that other processes calling anon_vma_moveto_tail() - * and changing the same_anon_vma list under mremap() will screw with - * the relative order of this process vmas in the list, because we - * they can't alter the order of any vma that belongs to this - * process. And there can't be another anon_vma_moveto_tail() running - * concurrently with mremap() coming from this process because we hold - * the mmap_sem for the whole mremap(). fork() ordering dependency - * also shouldn't be affected because fork() only cares that the - * parent vmas are placed in the list before the child vmas and - * anon_vma_moveto_tail() won't reorder vmas from either the fork() - * parent or child. - */ -void anon_vma_moveto_tail(struct vm_area_struct *dst) -{ - struct anon_vma_chain *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = pavc->anon_vma; - VM_BUG_ON(pavc->vma != dst); - root = lock_anon_vma_root(root, anon_vma); - list_del(&pavc->same_anon_vma); - list_add_tail(&pavc->same_anon_vma, &anon_vma->head); - } - unlock_anon_vma_root(root); -} - -/* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. @@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); - list_del(&avc->same_anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (list_empty(&anon_vma->head)) + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) continue; list_del(&avc->same_vma); @@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data) mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); - INIT_LIST_HEAD(&anon_vma->head); + anon_vma->rb_root = RB_ROOT; } void __init anon_vma_init(void) @@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) /* * At what user virtual address is page expected in @vma? - * Returns virtual address or -EFAULT if page's index/offset is not - * within the range mapped the @vma. */ -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; if (unlikely(is_vm_hugetlb_page(vma))) pgoff = page->index << huge_page_order(page_hstate(page)); - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } + + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + return address; } @@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { + unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - return vma_address(page, vma); + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + return -EFAULT; + return address; } /* @@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *pte; spinlock_t *ptl; - address = vma_address(page, vma); - if (address == -EFAULT) /* out of vma range */ + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) return 0; pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); if (!pte) /* the page is not in this mm */ @@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int referenced = 0; @@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret += page_mkclean_one(page, vma, address); } } @@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (page_evictable(page, vma)) + if (!mlocked_vma_newpage(vma, page)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(page); @@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page) } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_end_update_page_stat(page, &locked, &flags); } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ + return; out: if (!anon) mem_cgroup_end_update_page_stat(page, &locked, &flags); @@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ unsigned long end; int ret = SWAP_AGAIN; int locked_vma = 0; @@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (!pmd_present(*pmd)) return ret; + mmun_start = address; + mmun_end = end; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. @@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (locked_vma) up_read(&vma->vm_mm->mmap_sem); return ret; @@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) if (!anon_vma) return ret; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address; @@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; @@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; @@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { @@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (!anon_vma) return ret; anon_vma_lock(anon_vma); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; @@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; @@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt; /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 -struct shmem_xattr { - struct list_head list; /* anchored by shmem_inode_info->xattr_list */ - char *name; /* xattr name */ - size_t size; - char value[0]; -}; - /* * shmem_fallocate and shmem_writepage communicate via inode->i_private * (with i_mutex making sure that it has only one user at a time): @@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr, *nxattr; if (inode->i_mapping->a_ops == &shmem_aops) { shmem_unacct_size(info->flags, inode->i_size); @@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode) } else kfree(info->symlink); - list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { - kfree(xattr->name); - kfree(xattr); - } + simple_xattrs_free(&info->xattrs); BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); clear_inode(inode); @@ -1350,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -1377,7 +1365,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->xattr_list); + simple_xattrs_init(&info->xattrs); cache_no_acl(inode); switch (mode & S_IFMT) { @@ -2060,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co */ /* - * Allocate new xattr and copy in the value; but leave the name to callers. - */ -static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) -{ - struct shmem_xattr *new_xattr; - size_t len; - - /* wrap around? */ - len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) - return NULL; - - new_xattr = kmalloc(len, GFP_KERNEL); - if (!new_xattr) - return NULL; - - new_xattr->size = size; - memcpy(new_xattr->value, value, size); - return new_xattr; -} - -/* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, @@ -2090,11 +2056,11 @@ static int shmem_initxattrs(struct inode *inode, { struct shmem_inode_info *info = SHMEM_I(inode); const struct xattr *xattr; - struct shmem_xattr *new_xattr; + struct simple_xattr *new_xattr; size_t len; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); + new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) return -ENOMEM; @@ -2111,91 +2077,12 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - spin_lock(&info->lock); - list_add(&new_xattr->list, &info->xattr_list); - spin_unlock(&info->lock); + simple_xattr_list_add(&info->xattrs, new_xattr); } return 0; } -static int shmem_xattr_get(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct shmem_inode_info *info; - struct shmem_xattr *xattr; - int ret = -ENODATA; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (strcmp(name, xattr->name)) - continue; - - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, xattr->size); - } - break; - } - spin_unlock(&info->lock); - return ret; -} - -static int shmem_xattr_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr; - struct shmem_xattr *new_xattr = NULL; - int err = 0; - - /* value == NULL means remove */ - if (value) { - new_xattr = shmem_xattr_alloc(value, size); - if (!new_xattr) - return -ENOMEM; - - new_xattr->name = kstrdup(name, GFP_KERNEL); - if (!new_xattr->name) { - kfree(new_xattr); - return -ENOMEM; - } - } - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - } else { - list_del(&xattr->list); - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &info->xattr_list); - xattr = NULL; - } -out: - spin_unlock(&info->lock); - if (xattr) - kfree(xattr->name); - kfree(xattr); - return err; -} - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, @@ -2226,6 +2113,7 @@ static int shmem_xattr_validate(const char *name) static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2240,12 +2128,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, if (err) return err; - return shmem_xattr_get(dentry, name, buffer, size); + return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2260,15 +2149,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, if (err) return err; - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return shmem_xattr_set(dentry->d_inode, name, value, size, flags); - + return simple_xattr_set(&info->xattrs, name, value, size, flags); } static int shmem_removexattr(struct dentry *dentry, const char *name) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -2283,45 +2169,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) if (err) return err; - return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); -} - -static bool xattr_is_trusted(const char *name) -{ - return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return simple_xattr_remove(&info->xattrs, name); } static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); - struct shmem_xattr *xattr; - struct shmem_inode_info *info; - size_t used = 0; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - size_t len; - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; - - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } - } - spin_unlock(&info->lock); - - return used; + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + return simple_xattr_list(&info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ @@ -2788,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif + .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, @@ -2981,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#ifdef CONFIG_TRACING -size_t slab_buffer_size(struct kmem_cache *cachep) -{ - return cachep->size; -} -EXPORT_SYMBOL(slab_buffer_size); -#endif - /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -515,13 +507,6 @@ EXPORT_SYMBOL(slab_buffer_size); static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -static inline struct kmem_cache *page_get_cache(struct page *page) -{ - page = compound_head(page); - BUG_ON(!PageSlab(page)); - return page->slab_cache; -} - static inline struct kmem_cache *virt_to_cache(const void *obj) { struct page *page = virt_to_head_page(obj); @@ -585,9 +570,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; -static struct kmem_cache cache_cache = { - .nodelists = cache_cache_nodelists, +static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; +static struct kmem_cache kmem_cache_boot = { + .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -810,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } +#if DEBUG #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) static void __slab_error(const char *function, struct kmem_cache *cachep, @@ -818,7 +804,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); + add_taint(TAINT_BAD_PAGE); } +#endif /* * By default on NUMA we use alien caches to stage the freeing of @@ -900,7 +888,7 @@ static void __cpuinit start_cpu_timer(int cpu) */ if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); + INIT_DEFERRABLE_WORK(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, __round_jiffies_relative(HZ, cpu)); } @@ -983,7 +971,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, } /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 1; i < ac->avail; i++) { + for (i = 0; i < ac->avail; i++) { /* If a !PFMEMALLOC object is found, swap them */ if (!is_obj_pfmemalloc(ac->entry[i])) { objp = ac->entry[i]; @@ -1000,7 +988,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, l3 = cachep->nodelists[numa_mem_id()]; if (!list_empty(&l3->slabs_free) && force_refill) { struct slab *slabp = virt_to_slab(objp); - ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); + ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -1032,7 +1020,7 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, { if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_page(objp); + struct page *page = virt_to_head_page(objp); if (PageSlabPfmemalloc(page)) set_obj_pfmemalloc(&objp); } @@ -1601,15 +1589,17 @@ void __init kmem_cache_init(void) int order; int node; + kmem_cache = &kmem_cache_boot; + if (num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) - cache_cache.nodelists[i] = NULL; + kmem_cache->nodelists[i] = NULL; } - set_up_list3s(&cache_cache, CACHE_CACHE); + set_up_list3s(kmem_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1621,9 +1611,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct - * kmem_cache structures of all caches, except cache_cache itself: - * cache_cache is statically allocated. + * 1) initialize the kmem_cache cache: it contains the struct + * kmem_cache structures of all caches, except kmem_cache itself: + * kmem_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. @@ -1632,43 +1622,43 @@ void __init kmem_cache_init(void) * An __init data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized * head arrays. - * 4) Replace the __init data head arrays for cache_cache and the first + * 4) Replace the __init data head arrays for kmem_cache and the first * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_list3 for cache_cache and + * 5) Replace the __init data for kmem_list3 for kmem_cache and * the other cache's with kmalloc allocated memory. * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ node = numa_mem_id(); - /* 1) create the cache_cache */ + /* 1) create the kmem_cache */ INIT_LIST_HEAD(&slab_caches); - list_add(&cache_cache.list, &slab_caches); - cache_cache.colour_off = cache_line_size(); - cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; + list_add(&kmem_cache->list, &slab_caches); + kmem_cache->colour_off = cache_line_size(); + kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; + kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + nr_node_ids * sizeof(struct kmem_list3 *); - cache_cache.object_size = cache_cache.size; - cache_cache.size = ALIGN(cache_cache.size, + kmem_cache->object_size = kmem_cache->size; + kmem_cache->size = ALIGN(kmem_cache->object_size, cache_line_size()); - cache_cache.reciprocal_buffer_size = - reciprocal_value(cache_cache.size); + kmem_cache->reciprocal_buffer_size = + reciprocal_value(kmem_cache->size); for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, cache_cache.size, - cache_line_size(), 0, &left_over, &cache_cache.num); - if (cache_cache.num) + cache_estimate(order, kmem_cache->size, + cache_line_size(), 0, &left_over, &kmem_cache->num); + if (kmem_cache->num) break; } - BUG_ON(!cache_cache.num); - cache_cache.gfporder = order; - cache_cache.colour = left_over / cache_cache.colour_off; - cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + + BUG_ON(!kmem_cache->num); + kmem_cache->gfporder = order; + kmem_cache->colour = left_over / kmem_cache->colour_off; + kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ @@ -1681,19 +1671,22 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; + sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; + sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; + sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); + list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = - __kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; + sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; + sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; + sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); + list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); } slab_early_init = 0; @@ -1707,20 +1700,23 @@ void __init kmem_cache_init(void) * allow tighter packing of the smaller caches. */ if (!sizes->cs_cachep) { - sizes->cs_cachep = __kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes->cs_cachep->name = names->name; + sizes->cs_cachep->size = sizes->cs_size; + sizes->cs_cachep->object_size = sizes->cs_size; + sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); + list_add(&sizes->cs_cachep->list, &slab_caches); } #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = __kmem_cache_create( - names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| - SLAB_PANIC, - NULL); + sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes->cs_dmacachep->name = names->name_dma; + sizes->cs_dmacachep->size = sizes->cs_size; + sizes->cs_dmacachep->object_size = sizes->cs_size; + sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes->cs_dmacachep, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); + list_add(&sizes->cs_dmacachep->list, &slab_caches); #endif sizes++; names++; @@ -1731,15 +1727,15 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), + BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - cache_cache.array[smp_processor_id()] = ptr; + kmem_cache->array[smp_processor_id()] = ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); @@ -1760,7 +1756,7 @@ void __init kmem_cache_init(void) int nid; for_each_online_node(nid) { - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -1781,9 +1777,6 @@ void __init kmem_cache_init_late(void) slab_state = UP; - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* 6) resize the head arrays to their final sizes */ mutex_lock(&slab_mutex); list_for_each_entry(cachep, &slab_caches, list) @@ -1791,6 +1784,9 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* Done! */ slab_state = FULL; @@ -2209,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -static void __kmem_cache_destroy(struct kmem_cache *cachep) -{ - int i; - struct kmem_list3 *l3; - - for_each_online_cpu(i) - kfree(cachep->array[i]); - - /* NUMA: free the list3 structures */ - for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (l3) { - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - } - } - kmem_cache_free(&cache_cache, cachep); -} - - /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2366,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * Cannot be called within a int, but can be interrupted. * The @ctor is run when new pages are allocated by the cache. * - * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting unloaded. - * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -2381,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -struct kmem_cache * -__kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +int +__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { size_t left_over, slab_size, ralign; - struct kmem_cache *cachep = NULL; gfp_t gfp; + int err; + size_t size = cachep->size; #if DEBUG #if FORCED_DEBUG @@ -2459,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, ralign = ARCH_SLAB_MINALIGN; } /* 3) caller mandated alignment */ - if (ralign < align) { - ralign = align; + if (ralign < cachep->align) { + ralign = cachep->align; } /* disable debug if necessary */ if (ralign > __alignof__(unsigned long long)) @@ -2468,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align, /* * 4) Store it. */ - align = ralign; + cachep->align = ralign; if (slab_is_available()) gfp = GFP_KERNEL; else gfp = GFP_NOWAIT; - /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, gfp); - if (!cachep) - return NULL; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; - cachep->object_size = size; - cachep->align = align; #if DEBUG /* @@ -2506,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); + && cachep->object_size > cache_line_size() + && ALIGN(size, cachep->align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); size = PAGE_SIZE; } #endif @@ -2527,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align, */ flags |= CFLGS_OFF_SLAB; - size = ALIGN(size, align); + size = ALIGN(size, cachep->align); - left_over = calculate_slab_order(cachep, size, align, flags); + left_over = calculate_slab_order(cachep, size, cachep->align, flags); + + if (!cachep->num) + return -E2BIG; - if (!cachep->num) { - printk(KERN_ERR - "kmem_cache_create: couldn't create cache %s.\n", name); - kmem_cache_free(&cache_cache, cachep); - return NULL; - } slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + + sizeof(struct slab), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2566,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < align) - cachep->colour_off = align; + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; @@ -2588,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align, */ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } - cachep->ctor = ctor; - cachep->name = name; - if (setup_cpu_cache(cachep, gfp)) { - __kmem_cache_destroy(cachep); - return NULL; + err = setup_cpu_cache(cachep, gfp); + if (err) { + __kmem_cache_shutdown(cachep); + return err; } if (flags & SLAB_DEBUG_OBJECTS) { @@ -2606,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, slab_set_debugobj_lock_classes(cachep); } - /* cache setup completed, link it into the list */ - list_add(&cachep->list, &slab_caches); - return cachep; + return 0; } #if DEBUG @@ -2767,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -/** - * kmem_cache_destroy - delete a cache - * @cachep: the cache to destroy - * - * Remove a &struct kmem_cache object from the slab cache. - * - * It is expected this function will be called by a module when it is - * unloaded. This will remove the cache completely, and avoid a duplicate - * cache being allocated each time a module is loaded and unloaded, if the - * module doesn't have persistent in-kernel storage across loads and unloads. - * - * The cache must be empty before calling this function. - * - * The caller must guarantee that no one will allocate memory from the cache - * during the kmem_cache_destroy(). - */ -void kmem_cache_destroy(struct kmem_cache *cachep) +int __kmem_cache_shutdown(struct kmem_cache *cachep) { - BUG_ON(!cachep || in_interrupt()); + int i; + struct kmem_list3 *l3; + int rc = __cache_shrink(cachep); - /* Find the cache in the chain of caches. */ - get_online_cpus(); - mutex_lock(&slab_mutex); - /* - * the chain is never empty, cache_cache is never destroyed - */ - list_del(&cachep->list); - if (__cache_shrink(cachep)) { - slab_error(cachep, "Can't free all objects"); - list_add(&cachep->list, &slab_caches); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return; - } + if (rc) + return rc; - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - rcu_barrier(); + for_each_online_cpu(i) + kfree(cachep->array[i]); - __kmem_cache_destroy(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); + /* NUMA: free the list3 structures */ + for_each_online_node(i) { + l3 = cachep->nodelists[i]; + if (l3) { + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + } + } + return 0; } -EXPORT_SYMBOL(kmem_cache_destroy); /* * Get the memory for a slab management obj. @@ -3098,7 +3038,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, - void *caller) + unsigned long caller) { struct page *page; unsigned int objnr; @@ -3118,7 +3058,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone2(cachep, objp) = RED_INACTIVE; } if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = caller; + *dbg_userword(cachep, objp) = (void *)caller; objnr = obj_to_index(cachep, slabp, objp); @@ -3131,7 +3071,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, (unsigned long)caller); + store_stackinfo(cachep, objp, caller); kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 0); } else { @@ -3260,6 +3200,7 @@ force_grow: /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); + node = numa_mem_id(); /* no objects in sight? abort */ if (!x && (ac->avail == 0 || force_refill)) @@ -3284,7 +3225,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, #if DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, - gfp_t flags, void *objp, void *caller) + gfp_t flags, void *objp, unsigned long caller) { if (!objp) return objp; @@ -3301,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = caller; + *dbg_userword(cachep, objp) = (void *)caller; if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || @@ -3342,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == &cache_cache) + if (cachep == kmem_cache) return false; return should_failslab(cachep->object_size, flags, cachep->flags); @@ -3575,8 +3516,8 @@ done: * Fallback to other node is possible if __GFP_THISNODE is not set. */ static __always_inline void * -__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, - void *caller) +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, + unsigned long caller) { unsigned long save_flags; void *ptr; @@ -3662,7 +3603,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3798,7 +3739,7 @@ free_done: * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp, - void *caller) + unsigned long caller) { struct array_cache *ac = cpu_cache_get(cachep); @@ -3838,7 +3779,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = slab_alloc(cachep, flags, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3849,14 +3790,14 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void * -kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) +kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + ret = slab_alloc(cachep, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, - size, slab_buffer_size(cachep), flags); + size, cachep->size, flags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -3865,8 +3806,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, @@ -3877,17 +3817,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(size_t size, - struct kmem_cache *cachep, +void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, + size_t size) { void *ret; - ret = __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + trace_kmalloc_node(_RET_IP_, ret, - size, slab_buffer_size(cachep), + size, cachep->size, flags, nodeid); return ret; } @@ -3895,34 +3835,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) +__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(size, cachep, flags, node); + return kmem_cache_alloc_node_trace(cachep, flags, node, size); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, - __builtin_return_address(0)); + return __do_kmalloc_node(size, flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node); void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, (void *)caller); + return __do_kmalloc_node(size, flags, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); #else void *__kmalloc_node(size_t size, gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, NULL); + return __do_kmalloc_node(size, flags, node, 0); } EXPORT_SYMBOL(__kmalloc_node); #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ @@ -3935,7 +3874,7 @@ EXPORT_SYMBOL(__kmalloc_node); * @caller: function caller for debug tracking of the caller */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - void *caller) + unsigned long caller) { struct kmem_cache *cachep; void *ret; @@ -3948,9 +3887,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = __cache_alloc(cachep, flags, caller); + ret = slab_alloc(cachep, flags, caller); - trace_kmalloc((unsigned long) caller, ret, + trace_kmalloc(caller, ret, size, cachep->size, flags); return ret; @@ -3960,20 +3899,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, __builtin_return_address(0)); + return __do_kmalloc(size, flags, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) { - return __do_kmalloc(size, flags, (void *)caller); + return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); #else void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, NULL); + return __do_kmalloc(size, flags, 0); } EXPORT_SYMBOL(__kmalloc); #endif @@ -3994,7 +3933,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, __builtin_return_address(0)); + __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); trace_kmem_cache_free(_RET_IP_, objp); @@ -4025,7 +3964,7 @@ void kfree(const void *objp) debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, __builtin_return_address(0)); + __cache_free(c, (void *)objp, _RET_IP_); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); @@ -25,9 +25,26 @@ extern enum slab_state slab_state; /* The slab cache mutex protects the management structures during changes */ extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ extern struct list_head slab_caches; -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* Functions provided by the slab allocators */ +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); + +#ifdef CONFIG_SLUB +struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); +#else +static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ return NULL; } +#endif + + +int __kmem_cache_shutdown(struct kmem_cache *); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index aa3ca5b..9c21725 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -22,6 +22,53 @@ enum slab_state slab_state; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); +struct kmem_cache *kmem_cache; + +#ifdef CONFIG_DEBUG_VM +static int kmem_cache_sanity_check(const char *name, size_t size) +{ + struct kmem_cache *s = NULL; + + if (!name || in_interrupt() || size < sizeof(void *) || + size > KMALLOC_MAX_SIZE) { + pr_err("kmem_cache_create(%s) integrity check failed\n", name); + return -EINVAL; + } + + list_for_each_entry(s, &slab_caches, list) { + char tmp; + int res; + + /* + * This happens when the module gets unloaded and doesn't + * destroy its slab cache and no-one else reuses the vmalloc + * area of the module. Print a warning. + */ + res = probe_kernel_address(s->name, tmp); + if (res) { + pr_err("Slab cache with size %d has lost its name\n", + s->object_size); + continue; + } + + if (!strcmp(s->name, name)) { + pr_err("%s (%s): Cache name already exists.\n", + __func__, name); + dump_stack(); + s = NULL; + return -EINVAL; + } + } + + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ + return 0; +} +#else +static inline int kmem_cache_sanity_check(const char *name, size_t size) +{ + return 0; +} +#endif /* * kmem_cache_create - Create a cache. @@ -52,68 +99,92 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; - -#ifdef CONFIG_DEBUG_VM - if (!name || in_interrupt() || size < sizeof(void *) || - size > KMALLOC_MAX_SIZE) { - printk(KERN_ERR "kmem_cache_create(%s) integrity check" - " failed\n", name); - goto out; - } -#endif + int err = 0; get_online_cpus(); mutex_lock(&slab_mutex); -#ifdef CONFIG_DEBUG_VM - list_for_each_entry(s, &slab_caches, list) { - char tmp; - int res; + if (!kmem_cache_sanity_check(name, size) == 0) + goto out_locked; - /* - * This happens when the module gets unloaded and doesn't - * destroy its slab cache and no-one else reuses the vmalloc - * area of the module. Print a warning. - */ - res = probe_kernel_address(s->name, tmp); - if (res) { - printk(KERN_ERR - "Slab cache with size %d has lost its name\n", - s->object_size); - continue; - } - if (!strcmp(s->name, name)) { - printk(KERN_ERR "kmem_cache_create(%s): Cache name" - " already exists.\n", - name); - dump_stack(); - s = NULL; - goto oops; + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_locked; + + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (s) { + s->object_size = s->size = size; + s->align = align; + s->ctor = ctor; + s->name = kstrdup(name, GFP_KERNEL); + if (!s->name) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; } - } - WARN_ON(strchr(name, ' ')); /* It confuses parsers */ -#endif + err = __kmem_cache_create(s, flags); + if (!err) { - s = __kmem_cache_create(name, size, align, flags, ctor); + s->refcount = 1; + list_add(&s->list, &slab_caches); -#ifdef CONFIG_DEBUG_VM -oops: -#endif + } else { + kfree(s->name); + kmem_cache_free(kmem_cache, s); + } + } else + err = -ENOMEM; + +out_locked: mutex_unlock(&slab_mutex); put_online_cpus(); -#ifdef CONFIG_DEBUG_VM -out: -#endif - if (!s && (flags & SLAB_PANIC)) - panic("kmem_cache_create: Failed to create slab '%s'\n", name); + if (err) { + + if (flags & SLAB_PANIC) + panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", + name, err); + else { + printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + name, err); + dump_stack(); + } + + return NULL; + } return s; } EXPORT_SYMBOL(kmem_cache_create); +void kmem_cache_destroy(struct kmem_cache *s) +{ + get_online_cpus(); + mutex_lock(&slab_mutex); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + + if (!__kmem_cache_shutdown(s)) { + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + kfree(s->name); + kmem_cache_free(kmem_cache, s); + } else { + list_add(&s->list, &slab_caches); + printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); + dump_stack(); + } + } + mutex_unlock(&slab_mutex); + put_online_cpus(); +} +EXPORT_SYMBOL(kmem_cache_destroy); + int slab_is_available(void) { return slab_state >= UP; @@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) void *page; #ifdef CONFIG_NUMA - if (node != -1) + if (node != NUMA_NO_NODE) page = alloc_pages_exact_node(node, gfp, order); else #endif @@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) * If there's a node specification, search for a partial * page with a matching node id in the freelist. */ - if (node != -1 && page_to_nid(sp) != node) + if (node != NUMA_NO_NODE && page_to_nid(sp) != node) continue; #endif /* Enough room on this page? */ @@ -425,7 +425,8 @@ out: * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ -void *__kmalloc_node(size_t size, gfp_t gfp, int node) +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); @@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) *m = size; ret = (void *)m + align; - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(caller, ret, size, size + align, gfp, node); } else { unsigned int order = get_order(size); @@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) page->private = size; } - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); return ret; } + +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + return __do_kmalloc_node(size, gfp, node, _RET_IP_); +} EXPORT_SYMBOL(__kmalloc_node); +#ifdef CONFIG_TRACING +void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); +} + +#ifdef CONFIG_NUMA +void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, node, caller); +} +#endif +#endif + void kfree(const void *block) { struct page *sp; @@ -508,44 +529,24 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - struct kmem_cache *c; - - c = slob_alloc(sizeof(struct kmem_cache), - GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); + size_t align = c->size; - if (c) { - c->name = name; - c->size = size; - if (flags & SLAB_DESTROY_BY_RCU) { - /* leave room for rcu footer at the end of object */ - c->size += sizeof(struct slob_rcu); - } - c->flags = flags; - c->ctor = ctor; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - - kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); - c->refcount = 1; + if (flags & SLAB_DESTROY_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); } - return c; -} + c->flags = flags; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; + if (c->align < ARCH_SLAB_MINALIGN) + c->align = ARCH_SLAB_MINALIGN; + if (c->align < align) + c->align = align; -void kmem_cache_destroy(struct kmem_cache *c) -{ - kmemleak_free(c); - if (c->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - slob_free(c, sizeof(struct kmem_cache)); + return 0; } -EXPORT_SYMBOL(kmem_cache_destroy); void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { @@ -613,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); +int __kmem_cache_shutdown(struct kmem_cache *c) +{ + /* No way to check for remaining objects */ + return 0; +} + int kmem_cache_shrink(struct kmem_cache *d) { return 0; } EXPORT_SYMBOL(kmem_cache_shrink); +struct kmem_cache kmem_cache_boot = { + .name = "kmem_cache", + .size = sizeof(struct kmem_cache), + .flags = SLAB_PANIC, + .align = ARCH_KMALLOC_MINALIGN, +}; + void __init kmem_cache_init(void) { + kmem_cache = &kmem_cache_boot; slab_state = UP; } @@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *); static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) -{ - kfree(s->name); - kfree(s); -} +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif @@ -568,6 +564,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); printk(KERN_ERR "----------------------------------------" "-------------------------------------\n\n"); + + add_taint(TAINT_BAD_PAGE); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) @@ -624,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) { va_list args; char buf[100]; @@ -1069,13 +1067,13 @@ bad: return 0; } -static noinline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) +static noinline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { - unsigned long flags; - int rc = 0; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - local_irq_save(flags); + spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); if (!check_slab(s, page)) @@ -1113,15 +1111,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - rc = 1; out: slab_unlock(page); - local_irq_restore(flags); - return rc; + /* + * Keep node_lock to preserve integrity + * until the object is actually freed + */ + return n; fail: + slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); - goto out; + return NULL; } static int __init setup_slub_debug(char *str) @@ -1214,8 +1216,9 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) { return 0; } +static inline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1524,12 +1527,13 @@ static inline void *acquire_slab(struct kmem_cache *s, } static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n, struct kmem_cache_cpu *c) +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + struct kmem_cache_cpu *c, gfp_t flags) { struct page *page, *page2; void *object = NULL; @@ -1545,9 +1549,13 @@ static void *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { - void *t = acquire_slab(s, n, page, object == NULL); + void *t; int available; + if (!pfmemalloc_match(page, flags)) + continue; + + t = acquire_slab(s, n, page, object == NULL); if (!t) break; @@ -1614,7 +1622,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); + object = get_partial_node(s, n, c, flags); if (object) { /* * Return the object even if @@ -1643,7 +1651,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - object = get_partial_node(s, get_node(s, searchnode), c); + object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) return object; @@ -1709,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n, stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } -void init_kmem_cache_cpus(struct kmem_cache *s) +static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; @@ -1934,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s) * If we did not find a slot then simply move all the partials to the * per node partial list. */ -int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { struct page *oldpage; int pages; @@ -1957,6 +1965,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_save(flags); unfreeze_partials(s); local_irq_restore(flags); + oldpage = NULL; pobjects = 0; pages = 0; stat(s, CPU_PARTIAL_DRAIN); @@ -2305,7 +2314,7 @@ new_slab: * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc(struct kmem_cache *s, +static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { void **object; @@ -2375,9 +2384,15 @@ redo: return object; } +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, unsigned long addr) +{ + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); +} + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); @@ -2388,7 +2403,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); return ret; } @@ -2406,7 +2421,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -2420,7 +2435,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -2452,7 +2467,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); - if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) + if (kmem_cache_debug(s) && + !(n = free_debug_processing(s, page, x, addr, &flags))) return; do { @@ -2607,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); + if (kmem_cache_debug(s) && page->slab != s) { + pr_err("kmem_cache_free: Wrong slab cache. %s but object" + " is from %s\n", page->slab->name, s->name); + WARN_ON_ONCE(1); + return; + } + slab_free(s, page, x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); @@ -3021,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *)) +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { - memset(s, 0, kmem_size); - s->name = name; - s->ctor = ctor; - s->object_size = size; - s->align = align; - s->flags = kmem_cache_flags(size, flags, name, ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) @@ -3093,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s, else s->cpu_partial = 30; - s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -3101,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s, goto error; if (alloc_kmem_cache_cpus(s)) - return 1; + return 0; free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, oo_order(s->oo), + s->name, (unsigned long)s->size, s->size, oo_order(s->oo), s->offset, flags); - return 0; + return -EINVAL; } /* @@ -3132,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, sizeof(long), GFP_ATOMIC); if (!map) return; - slab_err(s, page, "%s", text); + slab_err(s, page, text, s->name); slab_lock(page); get_map(s, page, map); @@ -3164,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining on kmem_cache_close()"); + "Objects remaining in %s on kmem_cache_close()"); } } } @@ -3177,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - free_percpu(s->cpu_slab); /* Attempt to free all objects */ for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3186,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); return 0; } -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) +int __kmem_cache_shutdown(struct kmem_cache *s) { - mutex_lock(&slab_mutex); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - mutex_unlock(&slab_mutex); - if (kmem_cache_close(s)) { - printk(KERN_ERR "SLUB %s: %s called for cache that " - "still has objects.\n", s->name, __func__); - dump_stack(); - } - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); + int rc = kmem_cache_close(s); + + if (!rc) sysfs_slab_remove(s); - } else - mutex_unlock(&slab_mutex); + + return rc; } -EXPORT_SYMBOL(kmem_cache_destroy); /******************************************************************** * Kmalloc subsystem @@ -3221,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy); struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); -static struct kmem_cache *kmem_cache; - #ifdef CONFIG_ZONE_DMA static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; #endif @@ -3268,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, { struct kmem_cache *s; - s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + s->name = name; + s->size = s->object_size = size; + s->align = ARCH_KMALLOC_MINALIGN; /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slab_mutex here. */ - if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) + if (kmem_cache_open(s, flags)) goto panic; list_add(&s->list, &slab_caches); @@ -3357,7 +3358,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -3400,7 +3401,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -3477,7 +3478,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kmemleak_free(x); - put_page(page); + __free_pages(page, compound_order(page)); return; } slab_free(page->slab, page, object, _RET_IP_); @@ -3714,12 +3715,12 @@ void __init kmem_cache_init(void) slub_max_order = 0; kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); + nr_node_ids * sizeof(struct kmem_cache_node *); /* Allocate two kmem_caches from the page allocator */ kmalloc_size = ALIGN(kmem_size, cache_line_size()); order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); /* * Must first have the slab cache available for the allocations of the @@ -3728,9 +3729,10 @@ void __init kmem_cache_init(void) */ kmem_cache_node = (void *)kmem_cache + kmalloc_size; - kmem_cache_open(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache_node->name = "kmem_cache_node"; + kmem_cache_node->size = kmem_cache_node->object_size = + sizeof(struct kmem_cache_node); + kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -3738,8 +3740,10 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; temp_kmem_cache = kmem_cache; - kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache->name = "kmem_cache"; + kmem_cache->size = kmem_cache->object_size = kmem_size; + kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); memcpy(kmem_cache, temp_kmem_cache, kmem_size); @@ -3928,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - char *n; s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -3946,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, if (sysfs_slab_alias(s, name)) { s->refcount--; - return NULL; + s = NULL; } - return s; } - n = kstrdup(name, GFP_KERNEL); - if (!n) - return NULL; + return s; +} - s = kmalloc(kmem_size, GFP_KERNEL); - if (s) { - if (kmem_cache_open(s, n, - size, align, flags, ctor)) { - int r; +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +{ + int err; - list_add(&s->list, &slab_caches); - mutex_unlock(&slab_mutex); - r = sysfs_slab_add(s); - mutex_lock(&slab_mutex); + err = kmem_cache_open(s, flags); + if (err) + return err; - if (!r) - return s; + mutex_unlock(&slab_mutex); + err = sysfs_slab_add(s); + mutex_lock(&slab_mutex); - list_del(&s->list); - kmem_cache_close(s); - } - kfree(s); - } - kfree(n); - return NULL; + if (err) + kmem_cache_close(s); + + return err; } #ifdef CONFIG_SMP @@ -4028,7 +4024,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); + ret = slab_alloc(s, gfpflags, caller); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4058,7 +4054,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); @@ -5205,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, return err; } -static void kmem_cache_release(struct kobject *kobj) -{ - struct kmem_cache *s = to_slab(kobj); - - kfree(s->name); - kfree(s); -} - static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5220,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, - .release = kmem_cache_release }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); +/* + * Order of operations is important: flush the pagevec when it's already + * full, not when adding the last page, to make sure that last page is + * not added to the LRU directly when passed to this function. Because + * mark_page_accessed() (called after this when writing) only activates + * pages that are on the LRU, linear writes in subpage chunks would see + * every PAGEVEC_SIZE page activated, which is unexpected. + */ void __lru_cache_add(struct page *page, enum lru_list lru) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); - if (!pagevec_add(pvec, page)) + if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); + pagevec_add(pvec, page); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); @@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, SetPageLRU(page_tail); - if (page_evictable(page_tail, NULL)) { + if (page_evictable(page_tail)) { if (PageActive(page)) { SetPageActive(page_tail); active = 1; diff --git a/mm/truncate.c b/mm/truncate.c index 75801ac..d51ce92 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); - clear_page_mlock(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PageDirty(page)) goto failed; - clear_page_mlock(page); BUG_ON(page_has_private(page)); __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); @@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. @@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user); */ void *__krealloc(const void *p, size_t new_size, gfp_t flags) { - void *ret; - size_t ks = 0; - if (unlikely(!new_size)) return ZERO_SIZE_PTR; - if (p) - ks = ksize(p); + return __do_krealloc(p, new_size, flags); - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; } EXPORT_SYMBOL(__krealloc); @@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, flags); if (ret && p != ret) kfree(p); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1..78e0830 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, usize -= PAGE_SIZE; } while (usize > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } @@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; - seq_printf(m, "0x%p-0x%p %7ld", + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8d01243..2624edc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In event of a race, worst case is we end up with an @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (lru == LRU_UNEVICTABLE && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, + enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, - unsigned long *ret_nr_writeback) + unsigned long *ret_nr_writeback, + bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { - enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; cond_resched(); @@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, wait_on_page_writeback(page); } - references = page_check_references(page, sc); + if (!force_reclaim) + references = page_check_references(page, sc); + switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -960,6 +964,33 @@ keep: return nr_reclaimed; } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, true); + list_splice(&clean_pages, page_list); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (!PageLRU(page)) return ret; - /* Do not give back unevictable pages for compaction */ - if (PageUnevictable(page)) + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; ret = -EBUSY; @@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON(PageLRU(page)); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); @@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, - &nr_dirty, &nr_writeback); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_writeback, false); spin_lock_irq(&zone->lru_lock); @@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } @@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) return false; } +#ifdef CONFIG_COMPACTION +/* + * If compaction is deferred for sc->order then scale the number of pages + * reclaimed based on the number of consecutive allocation failures + */ +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + struct zone *zone = lruvec_zone(lruvec); + + if (zone->compact_order_failed <= sc->order) + pages_for_compaction <<= zone->compact_defer_shift; + return pages_for_compaction; +} +#else +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + return pages_for_compaction; +} +#endif + /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns @@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); + + pages_for_compaction = scale_for_compaction(pages_for_compaction, + lruvec, sc); inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); @@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + if (!kthread_should_stop()) schedule(); @@ -3101,8 +3165,9 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); - ret = -1; + pgdat->kswapd = NULL; + pr_err("Failed to start kswapd on node %d\n", nid); + ret = PTR_ERR(pgdat->kswapd); } return ret; } @@ -3349,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * page_evictable - test whether a page is evictable * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page) { - - if (mapping_unevictable(page_mapping(page))) - return 0; - - if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) - return 0; - - return 1; + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); } #ifdef CONFIG_SHMEM @@ -3407,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) if (!PageLRU(page) || !PageUnevictable(page)) continue; - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON(PageActive(page)); diff --git a/mm/vmstat.c b/mm/vmstat.c index df7a674..c737057 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pset->vm_stat_diff[i]) { + int v = pset->vm_stat_diff[i]; + pset->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + atomic_long_add(v, &vm_stat[i]); + } +} #endif #ifdef CONFIG_NUMA @@ -722,6 +734,7 @@ const char * const vmstat_text[] = { "numa_other", #endif "nr_anon_transparent_hugepages", + "nr_free_cma", "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -781,7 +794,6 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", @@ -1157,7 +1169,7 @@ static void __cpuinit start_cpu_timer(int cpu) { struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); + INIT_DEFERRABLE_WORK(work, vmstat_update); schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } |