diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 37 | ||||
-rw-r--r-- | mm/Makefile | 9 | ||||
-rw-r--r-- | mm/backing-dev.c | 18 | ||||
-rw-r--r-- | mm/bounce.c | 287 | ||||
-rw-r--r-- | mm/compaction.c | 343 | ||||
-rw-r--r-- | mm/dmapool.c | 27 | ||||
-rw-r--r-- | mm/early_ioremap.c | 245 | ||||
-rw-r--r-- | mm/filemap.c | 1249 | ||||
-rw-r--r-- | mm/fremap.c | 11 | ||||
-rw-r--r-- | mm/frontswap.c | 13 | ||||
-rw-r--r-- | mm/gup.c | 662 | ||||
-rw-r--r-- | mm/huge_memory.c | 188 | ||||
-rw-r--r-- | mm/hugetlb.c | 757 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 49 | ||||
-rw-r--r-- | mm/internal.h | 52 | ||||
-rw-r--r-- | mm/iov_iter.c | 743 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 36 | ||||
-rw-r--r-- | mm/kmemleak.c | 180 | ||||
-rw-r--r-- | mm/ksm.c | 9 | ||||
-rw-r--r-- | mm/list_lru.c | 16 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 274 | ||||
-rw-r--r-- | mm/memcontrol.c | 1223 | ||||
-rw-r--r-- | mm/memory-failure.c | 144 | ||||
-rw-r--r-- | mm/memory.c | 1227 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 148 | ||||
-rw-r--r-- | mm/mempolicy.c | 145 | ||||
-rw-r--r-- | mm/mempool.c | 12 | ||||
-rw-r--r-- | mm/migrate.c | 66 | ||||
-rw-r--r-- | mm/mincore.c | 20 | ||||
-rw-r--r-- | mm/mlock.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 178 | ||||
-rw-r--r-- | mm/mprotect.c | 56 | ||||
-rw-r--r-- | mm/mremap.c | 9 | ||||
-rw-r--r-- | mm/msync.c | 9 | ||||
-rw-r--r-- | mm/nobootmem.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 56 | ||||
-rw-r--r-- | mm/page-writeback.c | 51 | ||||
-rw-r--r-- | mm/page_alloc.c | 613 | ||||
-rw-r--r-- | mm/page_cgroup.c | 12 | ||||
-rw-r--r-- | mm/page_io.c | 42 | ||||
-rw-r--r-- | mm/percpu.c | 5 | ||||
-rw-r--r-- | mm/process_vm_access.c | 246 | ||||
-rw-r--r-- | mm/readahead.c | 31 | ||||
-rw-r--r-- | mm/rmap.c | 94 | ||||
-rw-r--r-- | mm/shmem.c | 328 | ||||
-rw-r--r-- | mm/slab.c | 314 | ||||
-rw-r--r-- | mm/slab.h | 68 | ||||
-rw-r--r-- | mm/slab_common.c | 302 | ||||
-rw-r--r-- | mm/slob.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 382 | ||||
-rw-r--r-- | mm/sparse.c | 4 | ||||
-rw-r--r-- | mm/swap.c | 291 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 253 | ||||
-rw-r--r-- | mm/truncate.c | 163 | ||||
-rw-r--r-- | mm/util.c | 63 | ||||
-rw-r--r-- | mm/vmacache.c | 132 | ||||
-rw-r--r-- | mm/vmalloc.c | 23 | ||||
-rw-r--r-- | mm/vmscan.c | 372 | ||||
-rw-r--r-- | mm/vmstat.c | 24 | ||||
-rw-r--r-- | mm/workingset.c | 414 | ||||
-rw-r--r-- | mm/zbud.c | 4 | ||||
-rw-r--r-- | mm/zsmalloc.c | 21 | ||||
-rw-r--r-- | mm/zswap.c | 88 |
65 files changed, 7681 insertions, 5180 deletions
@@ -134,6 +134,9 @@ config HAVE_MEMBLOCK config HAVE_MEMBLOCK_NODE_MAP boolean +config HAVE_MEMBLOCK_PHYS_MAP + boolean + config ARCH_DISCARD_MEMBLOCK boolean @@ -216,6 +219,7 @@ config PAGEFLAGS_EXTENDED # config SPLIT_PTLOCK_CPUS int + default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 default "4" @@ -263,6 +267,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + boolean + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -429,16 +436,6 @@ choice benefit. endchoice -config CROSS_MEMORY_ATTACH - bool "Cross Memory Support" - depends on MMU - default y - help - Enabling this option adds the system calls process_vm_readv and - process_vm_writev which allow a process with the correct privileges - to directly read from or write to to another process's address space. - See the man page for more details. - # # UP and nommu archs use km based percpu allocator # @@ -554,7 +551,7 @@ config MEM_SOFT_DIRTY See Documentation/vm/soft-dirty.txt for more details. config ZSMALLOC - bool "Memory allocator for compressed pages" + tristate "Memory allocator for compressed pages" depends on MMU default n help @@ -577,3 +574,21 @@ config PGTABLE_MAPPING You can check speed with zsmalloc benchmark: https://github.com/spartacus06/zsmapbench + +config GENERIC_EARLY_IOREMAP + bool + +config MAX_STACK_SIZE_MB + int "Maximum user stack size for 32-bit processes (MB)" + default 80 + range 8 256 if METAG + range 8 2048 + depends on STACK_GROWSUP && (!64BIT || COMPAT) + help + This is the maximum stack size in Megabytes in the VM layout of 32-bit + user processes when the stack grows upwards (currently only on parisc + and metag arch). The stack will be located at the highest memory + address minus the given value, unless the RLIMIT_STACK hard limit is + changed to a smaller value in which case that is used. + + A sane initial value is 80 MB. diff --git a/mm/Makefile b/mm/Makefile index 310c90a..4064f3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o @@ -16,8 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o \ - interval_tree.o list_lru.o $(mmu-y) + compaction.o balloon_compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ + iov_iter.o $(mmu-y) obj-y += init-mm.o @@ -29,7 +30,6 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o @@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ce682f7..1706cbb 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -288,13 +288,19 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). */ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -307,9 +313,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); - - /* bdi_list is now unused, clear it to mark @bdi dying */ - INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -360,6 +363,11 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ bdi_remove_from_list(bdi); + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + clear_bit(BDI_registered, &bdi->state); + spin_unlock_bh(&bdi->wb_lock); + /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi @@ -549,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) bit = sync ? BDI_sync_congested : BDI_async_congested; if (test_and_clear_bit(bit, &bdi->state)) atomic_dec(&nr_bdi_congested[sync]); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } diff --git a/mm/bounce.c b/mm/bounce.c deleted file mode 100644 index 523918b..0000000 --- a/mm/bounce.c +++ /dev/null @@ -1,287 +0,0 @@ -/* bounce buffer handling for block devices - * - * - Split from highmem.c - */ - -#include <linux/mm.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/gfp.h> -#include <linux/bio.h> -#include <linux/pagemap.h> -#include <linux/mempool.h> -#include <linux/blkdev.h> -#include <linux/init.h> -#include <linux/hash.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <asm/tlbflush.h> - -#include <trace/events/block.h> - -#define POOL_SIZE 64 -#define ISA_POOL_SIZE 16 - -static mempool_t *page_pool, *isa_page_pool; - -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) -static __init int init_emergency_pool(void) -{ -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) - if (max_pfn <= max_low_pfn) - return 0; -#endif - - page_pool = mempool_create_page_pool(POOL_SIZE, 0); - BUG_ON(!page_pool); - printk("bounce pool size: %d pages\n", POOL_SIZE); - - return 0; -} - -__initcall(init_emergency_pool); -#endif - -#ifdef CONFIG_HIGHMEM -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned long flags; - unsigned char *vto; - - local_irq_save(flags); - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); - local_irq_restore(flags); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - if (isa_page_pool) - return 0; - - isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(!isa_page_pool); - - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); - return 0; -} - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; - struct bvec_iter iter; - - bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { - /* - * fromvec->bv_offset and fromvec->bv_len might have - * been modified by the block layer, so use the original - * copy, bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); - } - - fromvec++; - } -} - -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; - int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - - /* - * free up bounce indirect pages used - */ - bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); - } - - bio_endio(bio_orig, err); - bio_put(bio); -} - -static void bounce_end_io_write(struct bio *bio, int err) -{ - bounce_end_io(bio, page_pool, err); -} - -static void bounce_end_io_write_isa(struct bio *bio, int err) -{ - - bounce_end_io(bio, isa_page_pool, err); -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio, pool, err); -} - -static void bounce_end_io_read(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, page_pool, err); -} - -static void bounce_end_io_read_isa(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, isa_page_pool, err); -} - -#ifdef CONFIG_NEED_BOUNCE_POOL -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - if (bio_data_dir(bio) != WRITE) - return 0; - - if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) - return 0; - - return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); -} -#else -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - return 0; -} -#endif /* CONFIG_NEED_BOUNCE_POOL */ - -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool, int force) -{ - struct bio *bio; - int rw = bio_data_dir(*bio_orig); - struct bio_vec *to, from; - struct bvec_iter iter; - unsigned i; - - if (force) - goto bounce; - bio_for_each_segment(from, *bio_orig, iter) - if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) - goto bounce; - - return; -bounce: - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - - bio_for_each_segment_all(to, bio, i) { - struct page *page = to->bv_page; - - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) - continue; - - inc_zone_page_state(to->bv_page, NR_BOUNCE); - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - - if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); - } - } - - trace_block_bio_bounce(q, *bio_orig); - - bio->bi_flags |= (1 << BIO_BOUNCED); - - if (pool == page_pool) { - bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - int must_bounce; - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - must_bounce = must_snapshot_stable_pages(q, *bio_orig); - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) - return; - pool = page_pool; - } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool, must_bounce); -} - -EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/compaction.c b/mm/compaction.c index 9185775..21bf292b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { struct zone *zone = cc->zone; + unsigned long pfn; if (cc->ignore_skip_hint) return; @@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (!nr_isolated) { - unsigned long pfn = page_to_pfn(page); + if (nr_isolated) + return; + + /* + * Only skip pageblocks when all forms of compaction will be known to + * fail in the near future. + */ + if (set_unsuitable) set_pageblock_skip(page); - /* Update where compaction should restart */ - if (migrate_scanner) { - if (!cc->finished_update_migrate && - pfn > zone->compact_cached_migrate_pfn) - zone->compact_cached_migrate_pfn = pfn; - } else { - if (!cc->finished_update_free && - pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (cc->finished_update_migrate) + return; + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (cc->finished_update_free) + return; + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } } #else @@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended */ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -static inline bool compact_trylock_irqsave(spinlock_t *lock, - unsigned long *flags, struct compact_control *cc) +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_checklock_irqsave() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. + */ +static inline bool compact_should_abort(struct compact_control *cc) { - return compact_checklock_irqsave(lock, flags, false, cc); + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return true; + } + + cond_resched(); + } + + return false; } /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_RESERVE) - return false; - - if (is_migrate_isolate(migratetype)) - return false; - - /* If the page is a large free page, then allow migration */ + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + return false; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) + if (migrate_async_suitable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -253,6 +276,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, struct page *cursor, *valid_page = NULL; unsigned long flags; bool locked = false; + bool checked_pageblock = false; cursor = pfn_to_page(blockpfn); @@ -284,8 +308,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; /* Recheck this is a suitable migration target under lock */ - if (!strict && !suitable_migration_target(page)) - break; + if (!strict && !checked_pageblock) { + /* + * We need to check suitability of pageblock only once + * and this isolate_freepages_block() is called with + * pageblock range, so just check once is sufficient. + */ + checked_pageblock = true; + if (!suitable_migration_target(page)) + break; + } /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) @@ -329,7 +361,8 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated, true, + false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -460,12 +493,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool skipped_async_unsuitable = false; + bool set_unsuitable = true; + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | + (unevictable ? ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -474,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -483,11 +518,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } + if (compact_should_abort(cc)) + return 0; + /* Time to isolate some pages for migration */ - cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; @@ -526,8 +563,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* If isolation recently failed, do not retry */ pageblock_nr = low_pfn >> pageblock_order; - if (!isolation_suitable(cc, page)) - goto next_pageblock; + if (last_pageblock_nr != pageblock_nr) { + int mt; + + last_pageblock_nr = pageblock_nr; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + + /* + * For async migration, also only scan in MOVABLE + * blocks. Async migration is optimistic to see if + * the minimum amount of work satisfies the allocation + */ + mt = get_pageblock_migratetype(page); + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { + set_unsuitable = false; + goto next_pageblock; + } + } /* * Skip if free. page_order cannot be used without zone->lock @@ -537,18 +591,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; /* - * For async migration, also only scan in MOVABLE blocks. Async - * migration is optimistic to see if the minimum amount of work - * satisfies the allocation - */ - if (!cc->sync && last_pageblock_nr != pageblock_nr && - !migrate_async_suitable(get_pageblock_migratetype(page))) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; - goto next_pageblock; - } - - /* * Check may be lockless but that's ok as we recheck later. * It's possible to migrate LRU pages and balloon pages * Skip any other type of page @@ -557,11 +599,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (unlikely(balloon_page_movable(page))) { if (locked && balloon_page_isolate(page)) { /* Successfully isolated */ - cc->finished_update_migrate = true; - list_add(&page->lru, migratelist); - cc->nr_migratepages++; - nr_isolated++; - goto check_compact_cluster; + goto isolate_success; } } continue; @@ -584,6 +622,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. + */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); @@ -598,12 +645,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (!cc->sync) - mode |= ISOLATE_ASYNC_MIGRATE; - - if (unevictable) - mode |= ISOLATE_UNEVICTABLE; - lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -613,13 +654,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON_PAGE(PageTransCompound(page), page); /* Successfully isolated */ - cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; -check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; @@ -630,7 +672,6 @@ check_compact_cluster: next_pageblock: low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); @@ -641,11 +682,10 @@ next_pageblock: /* * Update the pageblock-skip information and cached scanner pfn, * if the whole pageblock was scanned without isolating any page. - * This is not done when pageblock was skipped due to being unsuitable - * for async compaction, so that eventual sync compaction can try. */ - if (low_pfn == end_pfn && !skipped_async_unsuitable) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, + set_unsuitable, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -666,44 +706,48 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to + * the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. + * The low boundary is the end of the pageblock the migration scanner + * is using. */ - pfn = cc->free_pfn; + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. - */ - high_pfn = min(low_pfn, pfn); - - z_end_pfn = zone_end_pfn(zone); - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { + for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages) { unsigned long isolated; /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need - * to schedule. + * to schedule, or even abort async compaction. */ - cond_resched(); + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; - if (!pfn_valid(pfn)) + if (!pfn_valid(block_start_pfn)) continue; /* @@ -713,7 +757,7 @@ static void isolate_freepages(struct zone *zone, * i.e. it's possible that all pages within a zones range of * pages do not belong to a single zone. */ - page = pfn_to_page(pfn); + page = pfn_to_page(block_start_pfn); if (page_zone(page) != zone) continue; @@ -726,29 +770,26 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - isolated = 0; + cc->free_pfn = block_start_pfn; + isolated = isolate_freepages_block(cc, block_start_pfn, + block_end_pfn, freelist, false); + nr_freepages += isolated; /* - * As pfn may not start aligned, pfn+pageblock_nr_page - * may cross a MAX_ORDER_NR_PAGES boundary and miss - * a pfn_valid check. Ensure isolate_freepages_block() - * only scans within a pageblock + * Set a flag that we successfully isolated in this pageblock. + * In the next loop iteration, zone->compact_cached_free_pfn + * will not be updated and thus it will effectively contain the + * highest pageblock we isolated pages from. */ - end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - end_pfn = min(end_pfn, z_end_pfn); - isolated = isolate_freepages_block(cc, pfn, end_pfn, - freelist, false); - nr_freepages += isolated; + if (isolated) + cc->finished_update_free = true; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * isolate_freepages_block() might have aborted due to async + * compaction being contended */ - if (isolated) { - cc->finished_update_free = true; - high_pfn = max(high_pfn, pfn); - } + if (cc->contended) + break; } /* split_free_page does not map the pages */ @@ -758,10 +799,9 @@ static void isolate_freepages(struct zone *zone, * If we crossed the migrate scanner, we want to keep it that way * so that compact_finished() may detect this */ - if (pfn < low_pfn) - cc->free_pfn = max(pfn, zone->zone_start_pfn); - else - cc->free_pfn = high_pfn; + if (block_start_pfn < low_pfn) + cc->free_pfn = cc->migrate_pfn; + cc->nr_freepages = nr_freepages; } @@ -776,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* Isolate free pages if necessary */ + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. + */ if (list_empty(&cc->freepages)) { - isolate_freepages(cc->zone, cc); + if (!cc->contended) + isolate_freepages(cc->zone, cc); if (list_empty(&cc->freepages)) return NULL; @@ -792,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage, } /* - * We cannot control nr_migratepages and nr_freepages fully when migration is - * running as migrate_pages() has no knowledge of compact_control. When - * migration is complete, we count the number of pages on the lists by hand. + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. */ -static void update_nr_listpages(struct compact_control *cc) +static void compaction_free(struct page *page, unsigned long data) { - int nr_migratepages = 0; - int nr_freepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - list_for_each_entry(page, &cc->freepages, lru) - nr_freepages++; + struct compact_control *cc = (struct compact_control *)data; - cc->nr_migratepages = nr_migratepages; - cc->nr_freepages = nr_freepages; + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; } /* possible outcome of isolate_migratepages */ @@ -855,13 +892,14 @@ static int compact_finished(struct zone *zone, unsigned int order; unsigned long watermark; - if (fatal_signal_pending(current)) + if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { /* Let the next compaction start anew. */ - zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = zone_end_pfn(zone); /* @@ -961,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -986,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -994,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); @@ -1002,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1017,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if (err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it @@ -1053,9 +1091,8 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1064,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1086,7 +1123,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1094,7 +1131,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1119,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); @@ -1158,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0)) compaction_defer_reset(zone, cc->order, false); - /* Currently async compaction is never deferred. */ - else if (cc->sync) - defer_compaction(zone, cc->order); } VM_BUG_ON(!list_empty(&cc->freepages)); @@ -1172,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1185,7 +1219,8 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, }; __compact_pgdat(NODE_DATA(nid), &cc); @@ -1225,7 +1260,7 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -ssize_t sysfs_compact_node(struct device *dev, +static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { diff --git a/mm/dmapool.c b/mm/dmapool.c index 8058fcd..306baa5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->boundary = boundary; retval->allocation = allocation; - if (dev) { - int ret; + INIT_LIST_HEAD(&retval->pools); - mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - ret = device_create_file(dev, &dev_attr_pools); - else - ret = 0; - /* note: not currently insisting "name" be unique */ - if (!ret) - list_add(&retval->pools, &dev->dma_pools); - else { - kfree(retval); - retval = NULL; - } - mutex_unlock(&pools_lock); + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools) && + device_create_file(dev, &dev_attr_pools)) { + kfree(retval); + return NULL; } else - INIT_LIST_HEAD(&retval->pools); + list_add(&retval->pools, &dev->dma_pools); + mutex_unlock(&pools_lock); return retval; } @@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); - dma_pool_destroy(pool); + WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); } EXPORT_SYMBOL(dmam_pool_destroy); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c new file mode 100644 index 0000000..e10ccd2 --- /dev/null +++ b/mm/early_ioremap.c @@ -0,0 +1,245 @@ +/* + * Provide common bits of early_ioremap() support for architectures needing + * temporary mappings during boot before ioremap() is available. + * + * This is mostly a direct copy of the x86 early_ioremap implementation. + * + * (C) Copyright 1995 1996, 2014 Linus Torvalds + * + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <asm/fixmap.h> + +#ifdef CONFIG_MMU +static int early_ioremap_debug __initdata; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static int after_paging_init __initdata; + +void __init __weak early_ioremap_shutdown(void) +{ +} + +void __init early_ioremap_reset(void) +{ + early_ioremap_shutdown(); + after_paging_init = 1; +} + +/* + * Generally, ioremap() is available after paging_init() has been called. + * Architectures wanting to allow early_ioremap after paging_init() can + * define __late_set_fixmap and __late_clear_fixmap to do the right thing. + */ +#ifndef __late_set_fixmap +static inline void __init __late_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + BUG(); +} +#endif + +#ifndef __late_clear_fixmap +static inline void __init __late_clear_fixmap(enum fixed_addresses idx) +{ + BUG(); +} +#endif + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_setup(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (WARN_ON(prev_map[i])) + break; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +} + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (WARN(count, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n" + "please boot with early_ioremap_debug and report the dmesg.\n", + count)) + return 1; + return 0; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n", + __func__, (u64)phys_addr, size)) + return NULL; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (WARN_ON(!size || last_addr < phys_addr)) + return NULL; + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (WARN_ON(nrpages > NR_FIX_BTMAPS)) + return NULL; + + /* + * Ok, go for it.. + */ + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_set_fixmap(idx, phys_addr, prot); + else + __early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", + __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", + addr, size)) + return; + + if (WARN(prev_size[slot] != size, + "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot])) + return; + + WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", + addr, size, slot); + + virt_addr = (unsigned long)addr; + if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) + return; + + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO); +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, + FIXMAP_PAGE_NORMAL); +} +#else /* CONFIG_MMU */ + +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void __iomem *)phys_addr; +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ +} + +#endif /* CONFIG_MMU */ + + +void __init early_memunmap(void *addr, unsigned long size) +{ + early_iounmap((__force void __iomem *)addr, size); +} diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6a..65d44fd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -76,7 +77,7 @@ * ->mmap_sem * ->lock_page (access_process_vm) * - * ->i_mutex (generic_file_buffered_write) + * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -107,12 +108,75 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static void page_cache_tree_delete(struct address_space *mapping, + struct page *page, void *shadow) +{ + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; + + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { + mapping->nrshadows++; + /* + * Make sure the nrshadows update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } +} + /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -127,10 +191,11 @@ void __delete_from_page_cache(struct page *page) else cleancache_invalidate_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); + page_cache_tree_delete(mapping, page, shadow); + page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); @@ -166,7 +231,7 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -176,25 +241,15 @@ void delete_from_page_cache(struct page *page) } EXPORT_SYMBOL(delete_from_page_cache); -static int sleep_on_page(void *word) -{ - io_schedule(); - return 0; -} - -static int sleep_on_page_killable(void *word) -{ - sleep_on_page(word); - return fatal_signal_pending(current) ? -EINTR : 0; -} - static int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + if (test_bit(AS_ENOSPC, &mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) + if (test_bit(AS_EIO, &mapping->flags) && + test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } @@ -426,7 +481,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -446,25 +501,59 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -/** - * add_to_page_cache_locked - add a locked page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add a page to the pagecache. It must be locked. - * This function does not add the page to the LRU. The caller must do that. - */ -int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) { int error; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) return error; @@ -480,11 +569,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); + error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); trace_mm_filemap_add_to_page_cache(page); @@ -497,16 +585,49 @@ err_insert: page_cache_release(page); return error; } + +/** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add a page to the pagecache. It must be locked. + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); +} EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { + void *shadow = NULL; int ret; - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add_file(page); + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); + lru_cache_add(page); + } return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -520,10 +641,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } @@ -559,7 +680,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@ -572,7 +693,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) return 0; return __wait_on_bit(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } /** @@ -609,7 +730,7 @@ void unlock_page(struct page *page) { VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -620,17 +741,51 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (TestClearPageReclaim(page)) + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); rotate_reclaimable_page(page); + } if (!test_clear_page_writeback(page)) BUG(); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, int rw, int err) +{ + if (rw == READ) { + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* rw == WRITE */ + if (err) { + SetPageError(page); + if (page->mapping) + mapping_set_error(page->mapping, err); + } + end_page_writeback(page); + } +} +EXPORT_SYMBOL_GPL(page_endio); + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock @@ -639,7 +794,7 @@ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -649,7 +804,7 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sleep_on_page_killable, TASK_KILLABLE); + bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -686,14 +841,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * find_get_page - find and get a page reference + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -710,9 +952,9 @@ repeat: if (radix_tree_deref_retry(page)) goto repeat; /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto out; } @@ -734,24 +976,30 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_lock_entry - locate, pin and lock a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -764,44 +1012,90 @@ repeat: } return page; } -EXPORT_SYMBOL(find_lock_page); +EXPORT_SYMBOL(find_lock_entry); /** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode + * pagecache_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * @fgp_flags: PCG flags + * @cache_gfp_mask: gfp mask to use for the page cache data page allocation + * @radix_gfp_mask: gfp mask to use for radix tree node allocation + * + * Looks up the page cache slot at @mapping & @offset. + * + * PCG flags modify how the page is returned. * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @cache_gfp_mask and added to the page cache and the VM's LRU + * list. If radix tree nodes are allocated during page cache + * insertion then @radix_gfp_mask is used. The page is returned + * locked and with an increased refcount. Otherwise, %NULL is + * returned. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * If there is a page cache page, it is returned with an increased refcount. */ -struct page *find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) { struct page *page; - int err; + repeat: - page = find_lock_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp_mask); + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + cache_gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) { + cache_gfp_mask &= ~__GFP_FS; + radix_gfp_mask &= ~__GFP_FS; + } + + page = __page_cache_alloc(cache_gfp_mask); if (!page) return NULL; - /* - * We want a regular kernel memory (not highmem or DMA etc) - * allocation for the radix tree nodes, but we need to honour - * the context-specific requirements the caller has asked for. - * GFP_RECLAIM_MASK collects those requirements. - */ - err = add_to_page_cache_lru(page, mapping, index, - (gfp_mask & GFP_RECLAIM_MASK)); + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoit atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + init_page_accessed(page); + + err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -809,9 +1103,80 @@ repeat: goto repeat; } } + return page; } -EXPORT_SYMBOL(find_or_create_page); +EXPORT_SYMBOL(pagecache_get_page); + +/** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} /** * find_get_pages - gang pagecache lookup @@ -859,9 +1224,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so skip over it - - * we only reach this from invalidate_mapping_pages(). + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. */ continue; } @@ -926,9 +1291,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so stop looking for - * contiguous pages. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. */ break; } @@ -1002,10 +1367,17 @@ repeat: goto restart; } /* - * This function is never used on a shmem/tmpfs - * mapping, so a swap entry won't be found here. + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. */ - BUG(); + continue; } if (!page_cache_get_speculative(page)) @@ -1031,39 +1403,6 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page(), but do not wait if the page is unavailable. - * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) -{ - struct page *page = find_get_page(mapping, index); - - if (page) { - if (trylock_page(page)) - return page; - page_cache_release(page); - return NULL; - } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -1089,7 +1428,8 @@ static void shrink_readahead_size_eio(struct file *filp, * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position - * @desc: read_descriptor + * @iter: data destination + * @written: already copied * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1097,8 +1437,8 @@ static void shrink_readahead_size_eio(struct file *filp, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc) +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1108,12 +1448,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - int error; + int error = 0; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { @@ -1148,7 +1488,7 @@ find_page: if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, - desc, offset)) + offset, iter->count)) goto page_not_up_to_date_locked; unlock_page(page); } @@ -1198,24 +1538,23 @@ page_ok: /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The file_read_actor routine returns how many bytes were - * actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = file_read_actor(desc, page, offset, nr); + + ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); - if (ret == nr && desc->count) - continue; - goto out; + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; page_not_up_to_date: /* Get exclusive access to the page ... */ @@ -1250,6 +1589,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); + error = 0; goto find_page; } goto readpage_error; @@ -1280,7 +1620,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - desc->error = error; page_cache_release(page); goto out; @@ -1291,16 +1630,17 @@ no_cached_page: */ page = page_cache_alloc_cold(mapping); if (!page) { - desc->error = -ENOMEM; + error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); - if (error == -EEXIST) + if (error == -EEXIST) { + error = 0; goto find_page; - desc->error = error; + } goto out; } goto readpage; @@ -1313,130 +1653,45 @@ out: *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); + return written ? written : error; } -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. - */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; -} - -/* - * Performs necessary checks before doing a write - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @count: number of bytes to write - * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE - * - * Adjust number of segments and amount of bytes to write (nr_segs should be - * properly initialized first). Returns appropriate error code that caller - * should return or zero in case that write should be allowed. - */ -int generic_segment_checks(const struct iovec *iov, - unsigned long *nr_segs, size_t *count, int access_flags) -{ - unsigned long seg; - size_t cnt = 0; - for (seg = 0; seg < *nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - cnt += iv->iov_len; - if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(access_flags, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - *nr_segs = seg; - cnt -= iv->iov_len; /* This segment is no good */ - break; - } - *count = cnt; - return 0; -} -EXPORT_SYMBOL(generic_segment_checks); - /** - * generic_file_aio_read - generic filesystem read routine + * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @pos: current file position + * @iter: destination for the data read * - * This is the "read()" routine for all filesystems + * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. */ ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg = 0; - size_t count; + struct file *file = iocb->ki_filp; + ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; - - count = 0; - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; + loff_t pos = *ppos; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (filp->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT) { + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); loff_t size; - struct address_space *mapping; - struct inode *inode; - mapping = filp->f_mapping; - inode = mapping->host; if (!count) goto out; /* skip atime */ size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, - pos + iov_length(iov, nr_segs) - 1); + pos + count - 1); if (!retval) { - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); + struct iov_iter data = *iter; + retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); } + if (retval > 0) { *ppos = pos + retval; - count -= retval; + iov_iter_advance(iter, retval); } /* @@ -1447,49 +1702,17 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, * and return. Otherwise fallthrough to buffered io for * the rest of the read. */ - if (retval < 0 || !count || *ppos >= size) { - file_accessed(filp); + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + file_accessed(file); goto out; } } - count = retval; - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - loff_t offset = 0; - - /* - * If we did a short DIO read we need to skip the section of the - * iov that we've already read data into. - */ - if (count) { - if (count > iov[seg].iov_len) { - count -= iov[seg].iov_len; - continue; - } - offset = count; - count = 0; - } - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base + offset; - desc.count = iov[seg].iov_len - offset; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } + retval = do_generic_file_read(file, ppos, iter, retval); out: return retval; } -EXPORT_SYMBOL(generic_file_aio_read); +EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU /** @@ -1614,11 +1837,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - pgoff_t size; + loff_t size; int ret = 0; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (offset >= size) + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) return VM_FAULT_SIGBUS; /* @@ -1667,8 +1890,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(offset >= size)) { + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; @@ -1726,6 +1949,78 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; @@ -1755,6 +2050,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -1795,6 +2091,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -1821,6 +2129,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -1857,6 +2167,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -1864,40 +2178,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -1916,175 +2215,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. - */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - /* * Performs necessary checks before doing a write * @@ -2184,15 +2318,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - mark_page_accessed(page); return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, - size_t count, size_t ocount) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -2200,11 +2331,9 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, ssize_t written; size_t write_len; pgoff_t end; + struct iov_iter data; - if (count != ocount) - *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - - write_len = iov_length(iov, *nr_segs); + write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); @@ -2231,7 +2360,8 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } } - written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + data = *from; + written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); /* * Finally, try again to invalidate clean pages which might have been @@ -2248,11 +2378,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (written > 0) { pos += written; + iov_iter_advance(from, written); if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = pos; + iocb->ki_pos = pos; } out: return written; @@ -2266,39 +2397,23 @@ EXPORT_SYMBOL(generic_file_direct_write); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { - int status; - gfp_t gfp_mask; struct page *page; - gfp_t gfp_notmask = 0; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; - gfp_mask = mapping_gfp_mask(mapping); - if (mapping_cap_account_dirty(mapping)) - gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) - gfp_notmask = __GFP_FS; -repeat: - page = find_lock_page(mapping, index); + fgp_flags |= FGP_NOFS; + + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping), + GFP_KERNEL); if (page) - goto found; + wait_for_stable_page(page); - page = __page_cache_alloc(gfp_mask & ~gfp_notmask); - if (!page) - return NULL; - status = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & ~gfp_notmask); - if (unlikely(status)) { - page_cache_release(page); - if (status == -EEXIST) - goto repeat; - return NULL; - } -found: - wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); -static ssize_t generic_perform_write(struct file *file, +ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; @@ -2342,18 +2457,15 @@ again: status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); - if (unlikely(status)) + if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); - mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) @@ -2388,34 +2500,12 @@ again: return written ? written : status; } - -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - ssize_t status; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); - status = generic_perform_write(file, &i, pos); - - if (likely(status >= 0)) { - written += status; - *ppos = pos + status; - } - - return written ? written : status; -} -EXPORT_SYMBOL(generic_file_buffered_write); +EXPORT_SYMBOL(generic_perform_write); /** - * __generic_file_aio_write - write data to a file + * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) - * @iov: vector with data to write - * @nr_segs: number of segments in the vector - * @ppos: position where to write + * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -2429,30 +2519,19 @@ EXPORT_SYMBOL(generic_file_buffered_write); * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_mutex. */ -ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - loff_t pos; - ssize_t written; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; ssize_t err; - - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; - pos = *ppos; + ssize_t status; + size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2460,6 +2539,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(from, count); + err = file_remove_suid(file); if (err) goto out; @@ -2471,42 +2552,40 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + written = generic_file_direct_write(iocb, from, pos); if (written < 0 || written == count) goto out; + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, - written); + + status = generic_perform_write(file, from, pos); /* - * If generic_file_buffered_write() retuned a synchronous error + * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (written_buffered < 0) { - err = written_buffered; + if (unlikely(status < 0) && !written) { + err = status; goto out; } - + iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. */ - endbyte = pos + written_buffered - written - 1; + endbyte = pos + status - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { - written = written_buffered; + written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); @@ -2517,37 +2596,33 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; } out: current->backing_dev_info = NULL; return written ? written : err; } -EXPORT_SYMBOL(__generic_file_aio_write); +EXPORT_SYMBOL(__generic_file_write_iter); /** - * generic_file_aio_write - write data to a file + * generic_file_write_iter - write data to a file * @iocb: IO state structure - * @iov: vector with data to write - * @nr_segs: number of segments in the vector - * @pos: position in file where to write + * @from: iov_iter with data to write * - * This is a wrapper around __generic_file_aio_write() to be used by most + * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_mutex as needed. */ -ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { @@ -2559,7 +2634,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } return ret; } -EXPORT_SYMBOL(generic_file_aio_write); +EXPORT_SYMBOL(generic_file_write_iter); /** * try_to_release_page() - release old fs-specific metadata on a page diff --git a/mm/fremap.c b/mm/fremap.c index 34feba6..72b8fa3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, ptfile = pgoff_to_pte(pgoff); - if (!pte_none(*pte)) { - if (pte_present(*pte) && pte_soft_dirty(*pte)) - pte_file_mksoft_dirty(ptfile); + if (!pte_none(*pte)) zap_pte(mm, vma, addr, pte); - } - set_pte_at(mm, addr, pte, ptfile); + set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a @@ -152,6 +149,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, int has_write_lock = 0; vm_flags_t vm_flags = 0; + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + if (prot) return err; /* diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdc..c30eec5 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); static unsigned long __frontswap_curr_pages(void) { - int type; unsigned long totalpages = 0; struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + plist_for_each_entry(si, &swap_active_head, list) totalpages += atomic_read(&si->frontswap_pages); - } return totalpages; } @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, int si_frontswap_pages; unsigned long total_pages_to_unuse = total; unsigned long pages = 0, pages_to_unuse = 0; - int type; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + plist_for_each_entry(si, &swap_active_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } vm_unacct_memory(pages); *unused = pages_to_unuse; - *swapid = type; + *swapid = si->type; ret = 0; break; } @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list.head each time + * so restart scan from swap_active_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 0000000..cc5a9e7 --- /dev/null +++ b/mm/gup.c @@ -0,0 +1,662 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/spinlock.h> + +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> + +#include "internal.h" + +static struct page *no_page_table(struct vm_area_struct *vma, + unsigned int flags) +{ + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. + */ + if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return NULL; +} + +static struct page *follow_page_pte(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + spinlock_t *ptl; + pte_t *ptep, pte; + +retry: + if (unlikely(pmd_bad(*pmd))) + return no_page_table(vma, flags); + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto retry; + } + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) { + pte_unmap_unlock(ptep, ptl); + return NULL; + } + + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } + + if (flags & FOLL_GET) + get_page_foll(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ + mark_page_accessed(page); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. + */ + mlock_vma_page(page); + unlock_page(page); + } + } + pte_unmap_unlock(ptep, ptl); + return page; +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return NULL; + return no_page_table(vma, flags); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + return NULL; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return no_page_table(vma, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else + page = NULL; + } + return page; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + return no_page_table(vma, flags); + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + return follow_page_pte(vma, address, pmd, flags); + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; + } + } else + spin_unlock(ptl); + } + return follow_page_pte(vma, address, pmd, flags); +} + +static int get_gate_page(struct mm_struct *mm, unsigned long address, + unsigned int gup_flags, struct vm_area_struct **vma, + struct page **page) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret = -EFAULT; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + return -EFAULT; + if (address > TASK_SIZE) + pgd = pgd_offset_k(address); + else + pgd = pgd_offset_gate(mm, address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, address); + if (pte_none(*pte)) + goto unmap; + *vma = get_gate_vma(mm); + if (!page) + goto out; + *page = vm_normal_page(*vma, address, *pte); + if (!*page) { + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + goto unmap; + *page = pte_page(*pte); + } + get_page(*page); +out: + ret = 0; +unmap: + pte_unmap(pte); + return ret; +} + +static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned int fault_flags = 0; + int ret; + + /* For mlock, just skip the stack guard page. */ + if ((*flags & FOLL_MLOCK) && + (stack_guard_page_start(vma, address) || + stack_guard_page_end(vma, address + PAGE_SIZE))) + return -ENOENT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (*flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return -EBUSY; + } + + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when + * necessary, even if maybe_mkwrite decided not to set pte_write. We + * can thus safely do subsequent page lookups as if they were reads. + * But only do so when looping for pte_write is futile: in some cases + * userspace may also be wanting to write to the gotten user page, + * which a read fault here might prevent (a readonly page might get + * reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) + *flags &= ~FOLL_WRITE; + return 0; +} + +static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) +{ + vm_flags_t vm_flags = vma->vm_flags; + + if (vm_flags & (VM_IO | VM_PFNMAP)) + return -EFAULT; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * We used to let the write,force case do COW in a + * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could + * set a breakpoint in a read-only mapping of an + * executable, without corrupting the file (yet only + * when that file had been opened for writing!). + * Anon pages in shared mappings are surprising: now + * just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + return -EFAULT; + } + } + } else if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * Is there actually any vma we can reach here which does not + * have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + return -EFAULT; + } + return 0; +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + long i = 0; + unsigned int page_mask; + struct vm_area_struct *vma = NULL; + + if (!nr_pages) + return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + + /* + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + + do { + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + int ret; + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, + pages ? &pages[i] : NULL); + if (ret) + return i ? : ret; + page_mask = 0; + goto next_page; + } + + if (!vma || check_vma_flags(vma, gup_flags)) + return i ? : -EFAULT; + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, + gup_flags); + continue; + } + } +retry: + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + cond_resched(); + page = follow_page_mask(vma, start, foll_flags, &page_mask); + if (!page) { + int ret; + ret = faultin_page(tsk, vma, start, &foll_flags, + nonblocking); + switch (ret) { + case 0: + goto retry; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + return i ? i : ret; + case -EBUSY: + return i; + case -ENOENT: + goto next_page; + } + BUG(); + } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; + } +next_page: + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages); + return i; +} +EXPORT_SYMBOL(__get_user_pages); + +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + vm_flags_t vm_flags; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* + * get_user_pages() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) +{ + int flags = FOLL_TOUCH; + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); +} +EXPORT_SYMBOL(get_user_pages); + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655..33514d8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -5,6 +5,8 @@ * the COPYING file in the top-level directory. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/sched.h> #include <linux/highmem.h> @@ -151,8 +153,7 @@ static int start_khugepaged(void) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (unlikely(IS_ERR(khugepaged_thread))) { - printk(KERN_ERR - "khugepaged: kthread_run(khugepaged) failed\n"); + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } @@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); + pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group; } @@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) } out: if (!ret) - printk(KERN_WARNING - "transparent_hugepage= cannot parse, ignored\n"); + pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -941,79 +941,35 @@ unlock: spin_unlock(ptl); } -static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) +/* + * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages + * during copy_user_huge_page()'s copy_page_rep(): in the case when + * the source page gets split and a tail freed before copy completes. + * Called under pmd_lock of checked pmd, so safe from splitting itself. + */ +static void get_user_huge_page(struct page *page) { - spinlock_t *ptl; - pgtable_t pgtable; - pmd_t _pmd; - struct page *page; - int i, ret = 0; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!page) { - ret |= VM_FAULT_OOM; - goto out; - } - - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - put_page(page); - ret |= VM_FAULT_OOM; - goto out; + atomic_add(HPAGE_PMD_NR, &page->_count); + while (++page < endpage) + get_huge_page_tail(page); + } else { + get_page(page); } +} - clear_user_highpage(page, address); - __SetPageUptodate(page); - - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto out_free_page; - - pmdp_clear_flush(vma, haddr, pmd); - /* leave pmd empty until pte is filled */ - - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); +static void put_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - if (haddr == (address & PAGE_MASK)) { - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - } else { - entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); - entry = pte_mkspecial(entry); - } - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); + while (page < endpage) + put_page(page++); + } else { + put_page(page); } - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); - spin_unlock(ptl); - put_huge_zero_page(); - inc_mm_counter(mm, MM_ANONPAGES); - - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - ret |= VM_FAULT_WRITE; -out: - return ret; -out_free_page: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_page(page); - put_page(page); - goto out; } static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, @@ -1043,7 +999,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, + mem_cgroup_charge_anon(pages[i], mm, GFP_KERNEL))) { if (pages[i]) put_page(pages[i]); @@ -1149,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_page(page); + get_user_huge_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1161,8 +1117,8 @@ alloc: if (unlikely(!new_page)) { if (!page) { - ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, - address, pmd, orig_pmd, haddr); + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); @@ -1170,17 +1126,17 @@ alloc: split_huge_page(page); ret |= VM_FAULT_FALLBACK; } - put_page(page); + put_user_huge_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { put_page(new_page); if (page) { split_huge_page(page); - put_page(page); + put_user_huge_page(page); } else split_huge_page_pmd(vma, address, pmd); ret |= VM_FAULT_FALLBACK; @@ -1202,7 +1158,7 @@ alloc: spin_lock(ptl); if (page) - put_page(page); + put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); @@ -1611,16 +1567,23 @@ pmd_t *page_check_address_pmd(struct page *page, enum page_check_address_pmd_flag flag, spinlock_t **ptl) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) return NULL; - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) return NULL; + pmd = pmd_offset(pud, address); + *ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) goto unlock; if (pmd_page(*pmd) != page) goto unlock; @@ -1898,10 +1861,11 @@ static void __split_huge_page(struct page *page, * the newly established pmd of the child later during the * walk, to be able to set it as pmd_trans_splitting too. */ - if (mapcount != page_mapcount(page)) - printk(KERN_ERR "mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); - BUG_ON(mapcount != page_mapcount(page)); + if (mapcount != page_mapcount(page)) { + pr_err("mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG(); + } __split_huge_page_refcount(page, list); @@ -1912,10 +1876,11 @@ static void __split_huge_page(struct page *page, BUG_ON(is_vma_temporary_stack(vma)); mapcount2 += __split_huge_page_map(page, vma, addr); } - if (mapcount != mapcount2) - printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); - BUG_ON(mapcount != mapcount2); + if (mapcount != mapcount2) { + pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG(); + } } /* @@ -1966,17 +1931,22 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - struct mm_struct *mm = vma->vm_mm; - switch (advice) { case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif /* * Be somewhat over-protective like KSM for now! */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; - if (mm->def_flags & VM_NOHUGEPAGE) - return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* @@ -2429,7 +2399,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) return; /* @@ -2453,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; anon_vma_lock_write(vma->anon_vma); @@ -2553,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2803,7 +2769,7 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(); @@ -2907,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pmd = mm_find_pmd(mm, address); - if (!pmd) + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c01cb9f..7a0a73d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,6 +13,7 @@ #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> +#include <linux/compiler.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> @@ -22,6 +23,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/page-isolation.h> +#include <linux/jhash.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -53,6 +55,13 @@ static unsigned long __initdata default_hstate_size; */ DEFINE_SPINLOCK(hugetlb_lock); +/* + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. + */ +static int num_fault_mutexes; +static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -135,15 +144,8 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantiation_mutex. To access or modify a region the caller - * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation_mutex: - * - * down_write(&mm->mmap_sem); - * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * The region data structures are embedded into a resv_map and + * protected by a resv_map's lock */ struct file_region { struct list_head link; @@ -151,10 +153,12 @@ struct file_region { long to; }; -static long region_add(struct list_head *head, long f, long t) +static long region_add(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -184,14 +188,18 @@ static long region_add(struct list_head *head, long f, long t) } nrg->from = f; nrg->to = t; + spin_unlock(&resv->lock); return 0; } -static long region_chg(struct list_head *head, long f, long t) +static long region_chg(struct resv_map *resv, long f, long t) { - struct file_region *rg, *nrg; + struct list_head *head = &resv->regions; + struct file_region *rg, *nrg = NULL; long chg = 0; +retry: + spin_lock(&resv->lock); /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -201,15 +209,21 @@ static long region_chg(struct list_head *head, long f, long t) * Subtle, allocate a new region at the position but make it zero * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + goto retry; + } - return t - f; + list_add(&nrg->link, rg->link.prev); + chg = t - f; + goto out_nrg; } /* Round our left edge to the current segment if it encloses us. */ @@ -222,7 +236,7 @@ static long region_chg(struct list_head *head, long f, long t) if (&rg->link == head) break; if (rg->from > t) - return chg; + goto out; /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its @@ -233,20 +247,30 @@ static long region_chg(struct list_head *head, long f, long t) } chg -= rg->to - rg->from; } + +out: + spin_unlock(&resv->lock); + /* We already know we raced and no longer need the new region */ + kfree(nrg); + return chg; +out_nrg: + spin_unlock(&resv->lock); return chg; } -static long region_truncate(struct list_head *head, long end) +static long region_truncate(struct resv_map *resv, long end) { + struct list_head *head = &resv->regions; struct file_region *rg, *trg; long chg = 0; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (end <= rg->to) break; if (&rg->link == head) - return 0; + goto out; /* If we are in the middle of a region then adjust it. */ if (end > rg->from) { @@ -263,14 +287,19 @@ static long region_truncate(struct list_head *head, long end) list_del(&rg->link); kfree(rg); } + +out: + spin_unlock(&resv->lock); return chg; } -static long region_count(struct list_head *head, long f, long t) +static long region_count(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; + spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; @@ -286,6 +315,7 @@ static long region_count(struct list_head *head, long f, long t) chg += seg_to - seg_from; } + spin_unlock(&resv->lock); return chg; } @@ -376,39 +406,46 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -struct resv_map { - struct kref refs; - struct list_head regions; -}; - -static struct resv_map *resv_map_alloc(void) +struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) return NULL; kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); return resv_map; } -static void resv_map_release(struct kref *ref) +void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); /* Clear out any active regions before we release the map. */ - region_truncate(&resv_map->regions, 0); + region_truncate(resv_map, 0); kfree(resv_map); } +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + return inode->i_mapping->private_data; +} + static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return NULL; + } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -507,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_support(h)) + if (hugepages_treat_as_movable || hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -540,7 +577,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -562,7 +599,7 @@ retry_cpuset: } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -570,25 +607,242 @@ err: return NULL; } +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. + */ +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. + */ +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +static void destroy_compound_gigantic_page(struct page *page, + unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __ClearPageTail(p); + set_page_refcounted(p); + p->first_page = NULL; + } + + set_compound_order(page, 0); + __ClearPageHead(page); +} + +static void free_gigantic_page(struct page *page, unsigned order) +{ + free_contig_range(page_to_pfn(page), 1 << order); +} + +static int __alloc_gigantic_page(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long end_pfn = start_pfn + nr_pages; + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); +} + +static bool pfn_range_valid_gigantic(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long i, end_pfn = start_pfn + nr_pages; + struct page *page; + + for (i = start_pfn; i < end_pfn; i++) { + if (!pfn_valid(i)) + return false; + + page = pfn_to_page(i); + + if (PageReserved(page)) + return false; + + if (page_count(page) > 0) + return false; + + if (PageHuge(page)) + return false; + } + + return true; +} + +static bool zone_spans_last_pfn(const struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + unsigned long last_pfn = start_pfn + nr_pages - 1; + return zone_spans_pfn(zone, last_pfn); +} + +static struct page *alloc_gigantic_page(int nid, unsigned order) +{ + unsigned long nr_pages = 1 << order; + unsigned long ret, pfn, flags; + struct zone *z; + + z = NODE_DATA(nid)->node_zones; + for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { + spin_lock_irqsave(&z->lock, flags); + + pfn = ALIGN(z->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(z, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(pfn, nr_pages)) { + /* + * We release the zone lock here because + * alloc_contig_range() will also lock the zone + * at some point. If there's an allocation + * spinning on this lock, it may win the race + * and cause alloc_contig_range() to fail... + */ + spin_unlock_irqrestore(&z->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages); + if (!ret) + return pfn_to_page(pfn); + spin_lock_irqsave(&z->lock, flags); + } + pfn += nr_pages; + } + + spin_unlock_irqrestore(&z->lock, flags); + } + + return NULL; +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); +static void prep_compound_gigantic_page(struct page *page, unsigned long order); + +static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) +{ + struct page *page; + + page = alloc_gigantic_page(nid, huge_page_order(h)); + if (page) { + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, nid); + } + + return page; +} + +static int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) +{ + struct page *page = NULL; + int nr_nodes, node; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_gigantic_page_node(h, node); + if (page) + return 1; + } + + return 0; +} + +static inline bool gigantic_page_supported(void) { return true; } +#else +static inline bool gigantic_page_supported(void) { return false; } +static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void destroy_compound_gigantic_page(struct page *page, + unsigned long order) { } +static inline int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) { return 0; } +#endif + static void update_and_free_page(struct hstate *h, struct page *page) { int i; - VM_BUG_ON(h->order >= MAX_ORDER); + if (hstate_is_gigantic(h) && !gigantic_page_supported()) + return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | - 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1 << PG_writeback); + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); set_page_refcounted(page); - arch_release_hugepage(page); - __free_pages(page, huge_page_order(h)); + if (hstate_is_gigantic(h)) { + destroy_compound_gigantic_page(page, huge_page_order(h)); + free_gigantic_page(page, huge_page_order(h)); + } else { + arch_release_hugepage(page); + __free_pages(page, huge_page_order(h)); + } } struct hstate *size_to_hstate(unsigned long size) @@ -602,7 +856,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -627,7 +881,7 @@ static void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -731,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (h->order >= MAX_ORDER) - return NULL; - page = alloc_pages_exact_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -749,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } -/* - * common helper functions for hstate_next_node_to_{alloc|free}. - * We may have allocated or freed a huge page based on a different - * nodes_allowed previously, so h->next_node_to_{alloc|free} might - * be outside of *nodes_allowed. Ensure that we use an allowed - * node for alloc or free. - */ -static int next_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); - VM_BUG_ON(nid >= MAX_NUMNODES); - - return nid; -} - -static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - if (!node_isset(nid, *nodes_allowed)) - nid = next_node_allowed(nid, nodes_allowed); - return nid; -} - -/* - * returns the previously saved node ["this node"] from which to - * allocate a persistent huge page for the pool and advance the - * next node from which to allocate, handling wrap at end of node - * mask. - */ -static int hstate_next_node_to_alloc(struct hstate *h, - nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -/* - * helper for free_pool_huge_page() - return the previously saved - * node ["this node"] from which to free a huge page. Advance the - * next node id whether or not we find a free huge page to free so - * that the next attempt to free addresses the next node. - */ -static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); - h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ - nr_nodes--) - -#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_free(hs, mask)) || 1); \ - nr_nodes--) - static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; @@ -925,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) struct page *page; unsigned int r_nid; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return NULL; /* @@ -1118,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; nr_pages = min(unused_resv_pages, h->surplus_huge_pages); @@ -1134,6 +1312,7 @@ static void return_unused_surplus_pages(struct hstate *h, while (nr_pages--) { if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } @@ -1150,45 +1329,34 @@ static void return_unused_surplus_pages(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - return region_chg(&inode->i_mapping->private_list, - idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; + long chg; - } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv = vma_resv_map(vma); + if (!resv) return 1; - } else { - long err; - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + idx = vma_hugecache_offset(h, vma, addr); + chg = region_chg(resv, idx, idx + 1); - err = region_chg(&resv->regions, idx, idx + 1); - if (err < 0) - return err; - return 0; - } + if (vma->vm_flags & VM_MAYSHARE) + return chg; + else + return chg < 0 ? chg : 0; } static void vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - region_add(&inode->i_mapping->private_list, idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + resv = vma_resv_map(vma); + if (!resv) + return; - /* Mark this page used in the map. */ - region_add(&resv->regions, idx, idx + 1); - } + idx = vma_hugecache_offset(h, vma, addr); + region_add(resv, idx, idx + 1); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1218,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); - if (ret) { - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (ret) + goto out_subpool_put; + spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); - if (!page) { - hugetlb_cgroup_uncharge_cgroup(idx, - pages_per_huge_page(h), - h_cg); - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (!page) + goto out_uncharge_cgroup; + spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -1247,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, vma_commit_reservation(h, vma, addr); return page; + +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_subpool_put: + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); } /* @@ -1294,7 +1462,7 @@ found: return 1; } -static void prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -1328,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void) * fix confusing memory reports from free(1) and another * side-effects, like CommitLimit going negative. */ - if (h->order > (MAX_ORDER - 1)) + if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); } } @@ -1338,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_fresh_huge_page(h, @@ -1354,7 +1522,7 @@ static void __init hugetlb_init_hstates(void) for_each_hstate(h) { /* oversize hugepages were init'ed in early boot */ - if (h->order < MAX_ORDER) + if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } } @@ -1388,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, { int i; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; for_each_node_mask(i, *nodes_allowed) { @@ -1451,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, { unsigned long min_count, ret; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h) && !gigantic_page_supported()) return h->max_huge_pages; /* @@ -1478,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h, nodes_allowed); + if (hstate_is_gigantic(h)) + ret = alloc_fresh_gigantic_page(h, nodes_allowed); + else + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -1509,6 +1680,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -1577,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, goto out; h = kobj_to_hstate(kobj, &nid); - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; } @@ -1660,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); @@ -1944,16 +2116,15 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); + kfree(htlb_fault_mutex_table); } module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + int i; + + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -1973,6 +2144,17 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + htlb_fault_mutex_table = + kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + BUG_ON(!htlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&htlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -2069,9 +2251,12 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) return -EINVAL; table->data = &tmp; @@ -2122,9 +2307,12 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h)) return -EINVAL; table->data = &tmp; @@ -2147,6 +2335,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2163,6 +2353,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2177,6 +2369,9 @@ void hugetlb_show_meminfo(void) struct hstate *h; int nid; + if (!hugepages_supported()) + return; + for_each_node_state(nid, N_MEMORY) for_each_hstate(h) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", @@ -2251,41 +2446,30 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (resv) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_get(&resv->refs); } -static void resv_map_put(struct vm_area_struct *vma) -{ - struct resv_map *resv = vma_resv_map(vma); - - if (!resv) - return; - kref_put(&resv->refs, resv_map_release); -} - static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); - unsigned long reserve; - unsigned long start; - unsigned long end; + unsigned long reserve, start, end; - if (resv) { - start = vma_hugecache_offset(h, vma, vma->vm_start); - end = vma_hugecache_offset(h, vma, vma->vm_end); + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; - reserve = (end - start) - - region_count(&resv->regions, start, end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); - resv_map_put(vma); + reserve = (end - start) - region_count(resv, start, end); - if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); - } + kref_put(&resv->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugepage_subpool_put_pages(spool, reserve); } } @@ -2336,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -2375,7 +2584,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - if (!huge_pte_none(huge_ptep_get(src_pte))) { + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_ptep_get(src_pte); @@ -2394,32 +2620,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, return ret; } -static int is_hugetlb_entry_migration(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; - else - return 0; -} - -static int is_hugetlb_entry_hwpoisoned(pte_t pte) -{ - swp_entry_t swp; - - if (huge_pte_none(pte) || pte_present(pte)) - return 0; - swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; - else - return 0; -} - void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) @@ -2661,7 +2861,8 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -2705,7 +2906,7 @@ retry_avoidcopy: */ spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) { + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ @@ -2761,15 +2962,14 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags) + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; int anon_rmap = 0; - pgoff_t idx; unsigned long size; struct page *page; - struct address_space *mapping; pte_t new_pte; spinlock_t *ptl; @@ -2784,9 +2984,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - /* * Use page lock to guard against racing truncation * before we get page_table_lock. @@ -2871,8 +3068,7 @@ retry: if (anon_rmap) { ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); - } - else + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -2896,17 +3092,53 @@ backout_unlocked: goto out; } +#ifdef CONFIG_SMP +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + unsigned long key[2]; + u32 hash; + + if (vma->vm_flags & VM_SHARED) { + key[0] = (unsigned long) mapping; + key[1] = idx; + } else { + key[0] = (unsigned long) mm; + key[1] = address >> huge_page_shift(h); + } + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocesor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. + */ +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + return 0; +} +#endif + int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep; - pte_t entry; + pte_t *ptep, entry; spinlock_t *ptl; int ret; + u32 hash; + pgoff_t idx; struct page *page = NULL; struct page *pagecache_page = NULL; - static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + struct address_space *mapping; address &= huge_page_mask(h); @@ -2925,15 +3157,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!ptep) return VM_FAULT_OOM; + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - mutex_lock(&hugetlb_instantiation_mutex); + hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&htlb_fault_mutex_table[hash]); + entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, flags); + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; } @@ -3002,8 +3239,7 @@ out_ptl: put_page(page); out_mutex: - mutex_unlock(&hugetlb_instantiation_mutex); - + mutex_unlock(&htlb_fault_mutex_table[hash]); return ret; } @@ -3120,6 +3356,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); + mmu_notifier_invalidate_range_start(mm, start, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -3149,6 +3386,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + mmu_notifier_invalidate_range_end(mm, start, end); return pages << h->order; } @@ -3161,6 +3399,7 @@ int hugetlb_reserve_pages(struct inode *inode, long ret, chg; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3176,10 +3415,13 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + resv_map = inode_resv_map(inode); + + chg = region_chg(resv_map, from, to); + + } else { + resv_map = resv_map_alloc(); if (!resv_map) return -ENOMEM; @@ -3222,20 +3464,23 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(&inode->i_mapping->private_list, from, to); + region_add(resv_map, from, to); return 0; out_err: - if (vma) - resv_map_put(vma); + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct resv_map *resv_map = inode_resv_map(inode); + long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + if (resv_map) + chg = region_truncate(resv_map, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); @@ -3446,7 +3691,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* Can be overriden by architectures */ -__attribute__((weak)) struct page * +struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index cb00829..9aae6f4 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -30,7 +30,6 @@ struct hugetlb_cgroup { #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -struct cgroup_subsys hugetlb_subsys __read_mostly; static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) @@ -53,7 +52,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); + return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) @@ -182,7 +181,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget(&h_cg->css)) { + if (!css_tryget_online(&h_cg->css)) { rcu_read_unlock(); goto again; } @@ -254,15 +253,16 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, return res_counter_read_u64(&h_cg->hugepage[idx], name); } -static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -272,7 +272,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = res_counter_set_limit(&h_cg->hugepage[idx], val); @@ -281,17 +281,17 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } -static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(event); - name = MEMFILE_ATTR(event); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -304,7 +304,7 @@ static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) @@ -332,7 +332,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write_string = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; @@ -344,23 +344,22 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ cft = &h->cgroup_files[4]; memset(cft, 0, sizeof(*cft)); - WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); - - return; + WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files)); } void __init hugetlb_cgroup_file_init(void) @@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } -struct cgroup_subsys hugetlb_subsys = { - .name = "hugetlb", +struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, - .subsys_id = hugetlb_subsys_id, }; diff --git a/mm/internal.h b/mm/internal.h index 29e1e76..7f22a11f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include <linux/fs.h> #include <linux/mm.h> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. @@ -119,7 +134,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated @@ -129,7 +144,10 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended, or + * need_resched() true during async + * compaction + */ }; unsigned long @@ -154,6 +172,11 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); @@ -169,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path, to determine if a new page is being - * mapped into a LOCKED vma. If it is, mark page as mlocked. - */ -static inline int mlocked_vma_newpage(struct vm_area_struct *vma, - struct page *page) -{ - VM_BUG_ON_PAGE(PageLRU(page), page); - - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) - return 0; - - if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - return 1; -} - -/* * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); @@ -230,10 +233,6 @@ extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) -{ - return 0; -} static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } @@ -370,5 +369,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/iov_iter.c b/mm/iov_iter.c new file mode 100644 index 0000000..7b5dbd1 --- /dev/null +++ b/mm/iov_iter.c @@ -0,0 +1,743 @@ +#include <linux/export.h> +#include <linux/uio.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_readable(buf, copy)) { + kaddr = kmap_atomic(page); + to = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = to - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + to = kaddr + offset; + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +static size_t copy_from_user_atomic_iovec(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} + +static void advance_iovec(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + if (!(i->type & ITER_BVEC)) { + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); + } + return 0; +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +static unsigned long alignment_iovec(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = (unsigned long)iov->iov_base + i->iov_offset; + n = iov->iov_len - i->iov_offset; + if (n >= size) + return res | size; + size -= n; + res |= n; + while (size > (++iov)->iov_len) { + res |= (unsigned long)iov->iov_base | iov->iov_len; + size -= iov->iov_len; + } + res |= (unsigned long)iov->iov_base | size; + return res; +} + +void iov_iter_init(struct iov_iter *i, int direction, + const struct iovec *iov, unsigned long nr_segs, + size_t count) +{ + /* It will get better. Eventually... */ + if (segment_eq(get_fs(), KERNEL_DS)) + direction |= ITER_KVEC; + i->type = direction; + i->iov = iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_init); + +static ssize_t get_pages_iovec(struct iov_iter *i, + struct page **pages, size_t maxsize, + size_t *start) +{ + size_t offset = i->iov_offset; + const struct iovec *iov = i->iov; + size_t len; + unsigned long addr; + int n; + int res; + + len = iov->iov_len - offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + addr = (unsigned long)iov->iov_base + offset; + len += *start = addr & (PAGE_SIZE - 1); + addr &= ~(PAGE_SIZE - 1); + n = (len + PAGE_SIZE - 1) / PAGE_SIZE; + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + if (unlikely(res < 0)) + return res; + return (res == n ? len : res * PAGE_SIZE) - *start; +} + +static ssize_t get_pages_alloc_iovec(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + size_t offset = i->iov_offset; + const struct iovec *iov = i->iov; + size_t len; + unsigned long addr; + void *p; + int n; + int res; + + len = iov->iov_len - offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + addr = (unsigned long)iov->iov_base + offset; + len += *start = addr & (PAGE_SIZE - 1); + addr &= ~(PAGE_SIZE - 1); + n = (len + PAGE_SIZE - 1) / PAGE_SIZE; + + p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + if (!p) + return -ENOMEM; + + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; +} + +static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages) +{ + size_t offset = i->iov_offset; + size_t size = i->count; + const struct iovec *iov = i->iov; + int npages = 0; + int n; + + for (n = 0; size && n < i->nr_segs; n++, iov++) { + unsigned long addr = (unsigned long)iov->iov_base + offset; + size_t len = iov->iov_len - offset; + offset = 0; + if (unlikely(!len)) /* empty segment */ + continue; + if (len > size) + len = size; + npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE + - addr / PAGE_SIZE; + if (npages >= maxpages) /* don't bother going further */ + return maxpages; + size -= len; + offset = 0; + } + return min(npages, maxpages); +} + +static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) +{ + char *from = kmap_atomic(page); + memcpy(to, from + offset, len); + kunmap_atomic(from); +} + +static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) +{ + char *to = kmap_atomic(page); + memcpy(to + offset, from, len); + kunmap_atomic(to); +} + +static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + copy = min_t(size_t, bytes, bvec->bv_len - skip); + + kaddr = kmap_atomic(page); + from = kaddr + offset; + memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); + skip += copy; + from += copy; + bytes -= copy; + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy); + skip = copy; + from += copy; + bytes -= copy; + } + kunmap_atomic(kaddr); + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + + kaddr = kmap_atomic(page); + + to = kaddr + offset; + + copy = min(bytes, bvec->bv_len - skip); + + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); + + to += copy; + skip += copy; + bytes -= copy; + + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy); + skip = copy; + to += copy; + bytes -= copy; + } + kunmap_atomic(kaddr); + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted; +} + +static size_t copy_from_user_bvec(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t left; + const struct bio_vec *bvec; + size_t base = i->iov_offset; + + kaddr = kmap_atomic(page); + for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) { + size_t copy = min(left, bvec->bv_len - base); + if (!bvec->bv_len) + continue; + memcpy_from_page(kaddr + offset, bvec->bv_page, + bvec->bv_offset + base, copy); + offset += copy; + left -= copy; + } + kunmap_atomic(kaddr); + return bytes; +} + +static void advance_bvec(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct bio_vec *bvec = i->bvec; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !bvec->bv_len)) { + int copy; + + copy = min(bytes, bvec->bv_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (bvec->bv_len == base) { + bvec++; + nr_segs--; + base = 0; + } + } + i->bvec = bvec; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} + +static unsigned long alignment_bvec(const struct iov_iter *i) +{ + const struct bio_vec *bvec = i->bvec; + unsigned long res; + size_t size = i->count; + size_t n; + + if (!size) + return 0; + + res = bvec->bv_offset + i->iov_offset; + n = bvec->bv_len - i->iov_offset; + if (n >= size) + return res | size; + size -= n; + res |= n; + while (size > (++bvec)->bv_len) { + res |= bvec->bv_offset | bvec->bv_len; + size -= bvec->bv_len; + } + res |= bvec->bv_offset | size; + return res; +} + +static ssize_t get_pages_bvec(struct iov_iter *i, + struct page **pages, size_t maxsize, + size_t *start) +{ + const struct bio_vec *bvec = i->bvec; + size_t len = bvec->bv_len - i->iov_offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + *start = bvec->bv_offset + i->iov_offset; + + get_page(*pages = bvec->bv_page); + + return len; +} + +static ssize_t get_pages_alloc_bvec(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + const struct bio_vec *bvec = i->bvec; + size_t len = bvec->bv_len - i->iov_offset; + if (len > i->count) + len = i->count; + if (len > maxsize) + len = maxsize; + *start = bvec->bv_offset + i->iov_offset; + + *pages = kmalloc(sizeof(struct page *), GFP_KERNEL); + if (!*pages) + return -ENOMEM; + + get_page(**pages = bvec->bv_page); + + return len; +} + +static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages) +{ + size_t offset = i->iov_offset; + size_t size = i->count; + const struct bio_vec *bvec = i->bvec; + int npages = 0; + int n; + + for (n = 0; size && n < i->nr_segs; n++, bvec++) { + size_t len = bvec->bv_len - offset; + offset = 0; + if (unlikely(!len)) /* empty segment */ + continue; + if (len > size) + len = size; + npages++; + if (npages >= maxpages) /* don't bother going further */ + return maxpages; + size -= len; + offset = 0; + } + return min(npages, maxpages); +} + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_page_to_iter_bvec(page, offset, bytes, i); + else + return copy_page_to_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_to_iter); + +size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_page_from_iter_bvec(page, offset, bytes, i); + else + return copy_page_from_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_from_iter); + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + if (i->type & ITER_BVEC) + return copy_from_user_bvec(page, i, offset, bytes); + else + return copy_from_user_atomic_iovec(page, i, offset, bytes); +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +void iov_iter_advance(struct iov_iter *i, size_t size) +{ + if (i->type & ITER_BVEC) + advance_bvec(i, size); + else + advance_iovec(i, size); +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + if (i->nr_segs == 1) + return i->count; + else if (i->type & ITER_BVEC) + return min(i->count, i->iov->iov_len - i->iov_offset); + else + return min(i->count, i->bvec->bv_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +unsigned long iov_iter_alignment(const struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return alignment_bvec(i); + else + return alignment_iovec(i); +} +EXPORT_SYMBOL(iov_iter_alignment); + +ssize_t iov_iter_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, + size_t *start) +{ + if (i->type & ITER_BVEC) + return get_pages_bvec(i, pages, maxsize, start); + else + return get_pages_iovec(i, pages, maxsize, start); +} +EXPORT_SYMBOL(iov_iter_get_pages); + +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + if (i->type & ITER_BVEC) + return get_pages_alloc_bvec(i, pages, maxsize, start); + else + return get_pages_alloc_iovec(i, pages, maxsize, start); +} +EXPORT_SYMBOL(iov_iter_get_pages_alloc); + +int iov_iter_npages(const struct iov_iter *i, int maxpages) +{ + if (i->type & ITER_BVEC) + return iov_iter_npages_bvec(i, maxpages); + else + return iov_iter_npages_iovec(i, maxpages); +} +EXPORT_SYMBOL(iov_iter_npages); diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index ff0d977..dcdcadb 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "kmemleak: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -50,25 +52,25 @@ static int __init kmemleak_test_init(void) printk(KERN_INFO "Kmemleak testing\n"); /* make some orphan objects */ - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); #ifndef CONFIG_MODULES - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); #endif - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); /* * Add elements to a list. They should only appear as orphan @@ -76,7 +78,7 @@ static int __init kmemleak_test_init(void) */ for (i = 0; i < 10; i++) { elem = kzalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; INIT_LIST_HEAD(&elem->list); @@ -85,7 +87,7 @@ static int __init kmemleak_test_init(void) for_each_possible_cpu(i) { per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); - pr_info("kmemleak: kmalloc(129) = %p\n", + pr_info("kmalloc(129) = %p\n", per_cpu(kmemleak_test_pointer, i)); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 31f01c5..3cda50c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -192,15 +192,15 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +static int kmemleak_enabled; /* set in the late_initcall if there were no errors */ -static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ -static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ -static atomic_t kmemleak_warning = ATOMIC_INIT(0); +static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ -static atomic_t kmemleak_error = ATOMIC_INIT(0); +static int kmemleak_error; /* minimum and maximum address that may be valid pointers */ static unsigned long min_addr = ULONG_MAX; @@ -218,7 +218,8 @@ static int kmemleak_stack_scan = 1; static DEFINE_MUTEX(scan_mutex); /* setting kmemleak=on, will set this var, skipping the disable */ static int kmemleak_skip_disable; - +/* If there are leaks that can be reported */ +static bool kmemleak_found_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -267,7 +268,7 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warning(x); \ dump_stack(); \ - atomic_set(&kmemleak_warning, 1); \ + kmemleak_warning = 1; \ } while (0) /* @@ -386,7 +387,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%lx\n", object->flags); - pr_notice(" checksum = %d\n", object->checksum); + pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -805,7 +806,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, unsigned long flags; struct early_log *log; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* kmemleak stopped recording, just count the requests */ crt_early_log++; return; @@ -840,7 +841,7 @@ static void early_alloc(struct early_log *log) unsigned long flags; int i; - if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) return; /* @@ -893,9 +894,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -919,11 +920,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) * Percpu allocations are only scanned and not reported as leaks * (min_count is set to 0). */ - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, GFP_KERNEL); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -939,9 +940,9 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -959,9 +960,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -979,16 +980,50 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); /** + * kmemleak_update_trace - update object allocation stack trace + * @ptr: pointer to beginning of the object + * + * Override the object allocation stack trace for cases where the actual + * allocation place is not always useful. + */ +void __ref kmemleak_update_trace(const void *ptr) +{ + struct kmemleak_object *object; + unsigned long flags; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) + return; + + object = find_and_get_object((unsigned long)ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Updating stack trace for unknown object at %p\n", + ptr); +#endif + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->trace_len = __save_stack_trace(object->trace); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); +} +EXPORT_SYMBOL(kmemleak_update_trace); + +/** * kmemleak_not_leak - mark an allocated object as false positive * @ptr: pointer to beginning of the object * @@ -999,9 +1034,9 @@ void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1019,9 +1054,9 @@ void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1041,9 +1076,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1061,9 +1096,9 @@ void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1088,7 +1123,7 @@ static bool update_checksum(struct kmemleak_object *object) */ static int scan_should_stop(void) { - if (!atomic_read(&kmemleak_enabled)) + if (!kmemleak_enabled) return 1; /* @@ -1299,7 +1334,7 @@ static void kmemleak_scan(void) /* * Struct page scanning for each node. */ - lock_memory_hotplug(); + get_online_mems(); for_each_online_node(i) { unsigned long start_pfn = node_start_pfn(i); unsigned long end_pfn = node_end_pfn(i); @@ -1317,7 +1352,7 @@ static void kmemleak_scan(void) scan_block(page, page + 1, NULL, 1); } } - unlock_memory_hotplug(); + put_online_mems(); /* * Scanning the task stacks (may introduce false negatives). @@ -1382,9 +1417,12 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - if (new_leaks) + if (new_leaks) { + kmemleak_found_leaks = true; + pr_info("%d new suspected memory leaks (see " "/sys/kernel/debug/kmemleak)\n", new_leaks); + } } @@ -1545,11 +1583,6 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int kmemleak_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - static int dump_str_object_info(const char *str) { unsigned long flags; @@ -1592,8 +1625,12 @@ static void kmemleak_clear(void) spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); + + kmemleak_found_leaks = false; } +static void __kmemleak_do_cleanup(void); + /* * File write operation to configure kmemleak at run-time. The following * commands can be written to the /sys/kernel/debug/kmemleak file: @@ -1606,7 +1643,8 @@ static void kmemleak_clear(void) * disable it) * scan - trigger a memory scan * clear - mark all current reported unreferenced kmemleak objects as - * grey to ignore printing them + * grey to ignore printing them, or free all kmemleak objects + * if kmemleak has been disabled. * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, @@ -1616,9 +1654,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, int buf_size; int ret; - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; - buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; @@ -1628,6 +1663,19 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, if (ret < 0) return ret; + if (strncmp(buf, "clear", 5) == 0) { + if (kmemleak_enabled) + kmemleak_clear(); + else + __kmemleak_do_cleanup(); + goto out; + } + + if (!kmemleak_enabled) { + ret = -EBUSY; + goto out; + } + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1651,8 +1699,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); - else if (strncmp(buf, "clear", 5) == 0) - kmemleak_clear(); else if (strncmp(buf, "dump=", 5) == 0) ret = dump_str_object_info(buf + 5); else @@ -1674,9 +1720,19 @@ static const struct file_operations kmemleak_fops = { .read = seq_read, .write = kmemleak_write, .llseek = seq_lseek, - .release = kmemleak_release, + .release = seq_release, }; +static void __kmemleak_do_cleanup(void) +{ + struct kmemleak_object *object; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); +} + /* * Stop the memory scanning thread and free the kmemleak internal objects if * no previous scan thread (otherwise, kmemleak may still have some useful @@ -1684,18 +1740,14 @@ static const struct file_operations kmemleak_fops = { */ static void kmemleak_do_cleanup(struct work_struct *work) { - struct kmemleak_object *object; - bool cleanup = scan_thread == NULL; - mutex_lock(&scan_mutex); stop_scan_thread(); - if (cleanup) { - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); - } + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else + pr_info("Kmemleak disabled without freeing internal data. " + "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); mutex_unlock(&scan_mutex); } @@ -1708,14 +1760,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); static void kmemleak_disable(void) { /* atomically check whether it was already invoked */ - if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + if (cmpxchg(&kmemleak_error, 0, 1)) return; /* stop any memory operation tracing */ - atomic_set(&kmemleak_enabled, 0); + kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (atomic_read(&kmemleak_initialized)) + if (kmemleak_initialized) schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); @@ -1759,7 +1811,7 @@ void __init kmemleak_init(void) #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { - atomic_set(&kmemleak_early_log, 0); + kmemleak_early_log = 0; kmemleak_disable(); return; } @@ -1777,12 +1829,12 @@ void __init kmemleak_init(void) /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); - atomic_set(&kmemleak_early_log, 0); - if (atomic_read(&kmemleak_error)) { + kmemleak_early_log = 0; + if (kmemleak_error) { local_irq_restore(flags); return; } else - atomic_set(&kmemleak_enabled, 1); + kmemleak_enabled = 1; local_irq_restore(flags); /* @@ -1826,9 +1878,9 @@ void __init kmemleak_init(void) log->op_type); } - if (atomic_read(&kmemleak_warning)) { + if (kmemleak_warning) { print_log_trace(log); - atomic_set(&kmemleak_warning, 0); + kmemleak_warning = 0; } } } @@ -1840,9 +1892,9 @@ static int __init kmemleak_late_init(void) { struct dentry *dentry; - atomic_set(&kmemleak_initialized, 1); + kmemleak_initialized = 1; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately @@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - BUG_ON(pmd_trans_huge(*pmd)); mmun_start = addr; mmun_end = addr + PAGE_SIZE; @@ -1979,18 +1978,12 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE -static int just_wait(void *word) -{ - schedule(); - return 0; -} - static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), - just_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } diff --git a/mm/list_lru.c b/mm/list_lru.c index 72f9dec..f1a0db1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -87,11 +87,20 @@ restart: ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); case LRU_REMOVED: if (--nlru->nr_items == 0) node_clear(nid, lru->active_nodes); WARN_ON_ONCE(nlru->nr_items < 0); isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; break; case LRU_ROTATE: list_move_tail(item, &nlru->list); @@ -103,6 +112,7 @@ restart: * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. */ + assert_spin_locked(&nlru->lock); goto restart; default: BUG(); @@ -114,7 +124,7 @@ restart: } EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init(struct list_lru *lru) +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; @@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru) nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } return 0; } -EXPORT_SYMBOL_GPL(list_lru_init); +EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb9..a402f8f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, for (; start < end; start += PAGE_SIZE) { index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - page = find_get_page(mapping, index); + page = find_get_entry(mapping, index); if (!radix_tree_exceptional_entry(page)) { if (page) page_cache_release(page); diff --git a/mm/memblock.c b/mm/memblock.c index 39a31e7..6d2f219 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -27,6 +27,9 @@ static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; +#endif struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, @@ -37,6 +40,12 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + .physmem.regions = memblock_physmem_init_regions, + .physmem.cnt = 1, /* empty dummy entry */ + .physmem.max = INIT_PHYSMEM_REGIONS, +#endif + .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; @@ -472,7 +481,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, } /** - * memblock_add_region - add new memblock region + * memblock_add_range - add new memblock region * @type: memblock type to add new region into * @base: base address of the new region * @size: size of the new region @@ -487,7 +496,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * RETURNS: * 0 on success, -errno on failure. */ -static int __init_memblock memblock_add_region(struct memblock_type *type, +int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, unsigned long flags) { @@ -569,12 +578,12 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid, 0); + return memblock_add_range(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, + return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } @@ -654,8 +663,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -static int __init_memblock __memblock_remove(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_remove_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; int i, ret; @@ -671,9 +680,10 @@ static int __init_memblock __memblock_remove(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { - return __memblock_remove(&memblock.memory, base, size); + return memblock_remove_range(&memblock.memory, base, size); } + int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", @@ -681,7 +691,8 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); - return __memblock_remove(&memblock.reserved, base, size); + kmemleak_free_part(__va(base), size); + return memblock_remove_range(&memblock.reserved, base, size); } static int __init_memblock memblock_reserve_region(phys_addr_t base, @@ -696,7 +707,7 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_region(_rgn, base, size, nid, flags); + return memblock_add_range(_rgn, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) @@ -758,17 +769,19 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** - * __next_free_mem_range - next function for for_each_free_mem_range() + * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * - * Find the first free area from *@idx which matches @nid, fill the out + * Find the first area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of - * *@idx contains index into memory region and the upper 32bit indexes the - * areas before each reserved region. For example, if reserved regions + * *@idx contains index into type_a and the upper 32bit indexes the + * areas before each region in type_b. For example, if type_b regions * look like the following, * * 0:[0-16), 1:[32-48), 2:[128-130) @@ -780,53 +793,77 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_free_mem_range(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __init_memblock __next_mem_range(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + if (WARN_ONCE(nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; - for ( ; mi < mem->cnt; mi++) { - struct memblock_region *m = &mem->regions[mi]; + for (; idx_a < type_a->cnt; idx_a++) { + struct memblock_region *m = &type_a->regions[idx_a]; + phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != m_nid) continue; - /* scan areas before each reservation for intersection */ - for ( ; ri < rsv->cnt + 1; ri++) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b < type_b->cnt + 1; idx_b++) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; - /* if ri advanced past mi, break out to advance mi */ + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? + r->base : ULLONG_MAX; + + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ if (r_start >= m_end) break; /* if the two regions intersect, we're done */ if (m_start < r_end) { if (out_start) - *out_start = max(m_start, r_start); + *out_start = + max(m_start, r_start); if (out_end) *out_end = min(m_end, r_end); if (out_nid) - *out_nid = memblock_get_region_node(m); + *out_nid = m_nid; /* - * The region which ends first is advanced - * for the next iteration. + * The region which ends first is + * advanced for the next iteration. */ if (m_end <= r_end) - mi++; + idx_a++; else - ri++; - *idx = (u32)mi | (u64)ri << 32; + idx_b++; + *idx = (u32)idx_a | (u64)idx_b << 32; return; } } @@ -837,57 +874,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, } /** - * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() + * __next_mem_range_rev - generic next function for for_each_*_range_rev() + * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. + * * @idx: pointer to u64 loop variable * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * - * Reverse of __next_free_mem_range(). - * - * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't - * be able to hot-remove hotpluggable memory used by the kernel. So this - * function skip hotpluggable regions if needed when allocating memory for the - * kernel. + * Reverse of __next_mem_range(). */ -void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; if (*idx == (u64)ULLONG_MAX) { - mi = mem->cnt - 1; - ri = rsv->cnt; + idx_a = type_a->cnt - 1; + idx_b = type_b->cnt; } - for ( ; mi >= 0; mi--) { - struct memblock_region *m = &mem->regions[mi]; + for (; idx_a >= 0; idx_a--) { + struct memblock_region *m = &type_a->regions[idx_a]; + phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != m_nid) continue; /* skip hotpluggable memory regions if needed */ if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; - /* scan areas before each reservation for intersection */ - for ( ; ri >= 0; ri--) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b >= 0; idx_b--) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? + r->base : ULLONG_MAX; + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ - /* if ri advanced past mi, break out to advance mi */ if (r_end <= m_start) break; /* if the two regions intersect, we're done */ @@ -897,18 +957,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, if (out_end) *out_end = min(m_end, r_end); if (out_nid) - *out_nid = memblock_get_region_node(m); - + *out_nid = m_nid; if (m_start >= r_start) - mi--; + idx_a--; else - ri--; - *idx = (u32)mi | (u64)ri << 32; + idx_b--; + *idx = (u32)idx_a | (u64)idx_b << 32; return; } } } - + /* signal end of iteration */ *idx = ULLONG_MAX; } @@ -975,22 +1034,40 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid) +static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, 0, max_addr, nid); - if (found && !memblock_reserve(found, size)) + found = memblock_find_in_range_node(size, align, start, end, nid); + if (found && !memblock_reserve(found, size)) { + /* + * The min_count is set to 0 so that memblock allocations are + * never reported as leaks. + */ + kmemleak_alloc(__va(found), size, 0, 0); return found; - + } return 0; } +phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end) +{ + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); +} + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + return memblock_alloc_range_nid(size, align, 0, max_addr, nid); +} + phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); @@ -1201,7 +1278,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) __func__, (u64)base, (u64)base + size - 1, (void *)_RET_IP_); kmemleak_free_part(__va(base), size); - __memblock_remove(&memblock.reserved, base, size); + memblock_remove_range(&memblock.reserved, base, size); } /* @@ -1253,7 +1330,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) pages += end_pfn - start_pfn; } - return (phys_addr_t)pages << PAGE_SHIFT; + return PFN_PHYS(pages); } /* lowest address */ @@ -1271,16 +1348,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - unsigned long i; phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + struct memblock_region *r; if (!limit) return; /* find out max address */ - for (i = 0; i < memblock.memory.cnt; i++) { - struct memblock_region *r = &memblock.memory.regions[i]; - + for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1289,8 +1364,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) } /* truncate both memory and reserved regions */ - __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); - __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.memory, max_addr, + (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.reserved, max_addr, + (phys_addr_t)ULLONG_MAX); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) @@ -1326,14 +1403,13 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { struct memblock_type *type = &memblock.memory; - int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) return -1; - *start_pfn = type->regions[mid].base >> PAGE_SHIFT; - *end_pfn = (type->regions[mid].base + type->regions[mid].size) - >> PAGE_SHIFT; + *start_pfn = PFN_DOWN(type->regions[mid].base); + *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); return type->regions[mid].nid; } @@ -1379,13 +1455,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si void __init_memblock memblock_trim_memory(phys_addr_t align) { - int i; phys_addr_t start, end, orig_start, orig_end; - struct memblock_type *mem = &memblock.memory; + struct memblock_region *r; - for (i = 0; i < mem->cnt; i++) { - orig_start = mem->regions[i].base; - orig_end = mem->regions[i].base + mem->regions[i].size; + for_each_memblock(memory, r) { + orig_start = r->base; + orig_end = r->base + r->size; start = round_up(orig_start, align); end = round_down(orig_end, align); @@ -1393,11 +1468,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align) continue; if (start < end) { - mem->regions[i].base = start; - mem->regions[i].size = end - start; + r->base = start; + r->size = end - start; } else { - memblock_remove_region(mem, i); - i--; + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; } } } @@ -1407,6 +1483,11 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) memblock.current_limit = limit; } +phys_addr_t __init_memblock memblock_get_current_limit(void) +{ + return memblock.current_limit; +} + static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; @@ -1499,6 +1580,9 @@ static int __init memblock_init_debugfs(void) return -ENXIO; debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); +#endif return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b6b003..f009a14 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,8 +66,8 @@ #include <trace/events/vmscan.h> -struct cgroup_subsys mem_cgroup_subsys __read_mostly; -EXPORT_SYMBOL(mem_cgroup_subsys); +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -80,7 +80,7 @@ int do_swap_account __read_mostly; #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata; #endif #else @@ -357,10 +357,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list. per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -527,18 +526,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - /* - * The ID of the root cgroup is 0, but memcg treat 0 as an - * invalid ID, so we return (cgroup_id + 1). - */ - return memcg->css.cgroup->id + 1; + return memcg->css.id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &mem_cgroup_subsys); + css = css_from_id(id, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } @@ -571,7 +566,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { + memcg_proto_active(cg_proto) && + css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -677,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg) static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { - VM_BUG_ON((unsigned)nid >= nr_node_ids); + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } @@ -689,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(memcg, nid, zid); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } static struct mem_cgroup_tree_per_zone * @@ -712,11 +710,9 @@ soft_limit_tree_from_page(struct page *page) return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -746,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, mz->on_tree = true; } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { if (!mz->on_tree) return; @@ -757,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, mz->on_tree = false; } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); spin_unlock(&mctz->lock); } @@ -773,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page); /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = mem_cgroup_page_zoneinfo(memcg, page); excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or @@ -792,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); } } @@ -805,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - int node, zone; - struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_zone *mz; + int nid, zid; - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + mctz = soft_limit_tree_node_zone(nid, zid); + mem_cgroup_remove_exceeded(mz, mctz); } } } @@ -836,9 +826,9 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) + !css_tryget_online(&mz->memcg->css)) goto retry; done: return mz; @@ -921,8 +911,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, bool anon, int nr_pages) { - preempt_disable(); - /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. @@ -947,12 +935,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, } __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - - preempt_enable(); } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -960,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, - unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, + unsigned int lru_mask) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - unsigned long ret = 0; - - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - for_each_lru(lru) { - if (BIT(lru) & lru_mask) - ret += mz->lru_size[lru]; - } - return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - u64 total = 0; + unsigned long nr = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(memcg, - nid, zid, lru_mask); + VM_BUG_ON((unsigned)nid >= nr_node_ids); - return total; + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + nr += mz->lru_size[lru]; + } + } + return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { + unsigned long nr = 0; int nid; - u64 total = 0; for_each_node_state(nid, N_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return total; + nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + return nr; } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1072,26 +1049,28 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; - if (!mm) - return NULL; - /* - * Because we have no locks, mm->owner's may be being moved to other - * cgroup. We use css_tryget() here even if this looks - * pessimistic (rather than adding locks here). - */ rcu_read_lock(); do { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - break; - } while (!css_tryget(&memcg->css)); + /* + * Page cache insertions can happen withou an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + */ + if (unlikely(!mm)) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } + } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -1128,7 +1107,8 @@ skip_node: */ if (next_css) { if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) + ((next_css->flags & CSS_ONLINE) && + css_tryget_online(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1174,7 +1154,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, * would be returned all the time. */ if (position && position != root && - !css_tryget(&position->css)) + !css_tryget_online(&position->css)) position = NULL; } return position; @@ -1245,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, int uninitialized_var(seq); if (reclaim) { - int nid = zone_to_nid(reclaim->zone); - int zid = zone_idx(reclaim->zone); struct mem_cgroup_per_zone *mz; - mz = mem_cgroup_zoneinfo(root, nid, zid); + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; @@ -1356,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, goto out; } - mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + mz = mem_cgroup_zone_zoneinfo(memcg, zone); lruvec = &mz->lruvec; out: /* @@ -1415,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; - mz = page_cgroup_zoneinfo(memcg, page); + mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1486,7 +1464,7 @@ bool task_in_mem_cgroup(struct task_struct *task, p = find_lock_task_mm(task); if (p) { - curr = try_get_mem_cgroup_from_mm(p->mm); + curr = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1500,8 +1478,6 @@ bool task_in_mem_cgroup(struct task_struct *task, css_get(&curr->css); rcu_read_unlock(); } - if (!curr) - return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -1555,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!css_parent(&memcg->css)) + if (mem_cgroup_disabled() || !memcg->css.parent) return vm_swappiness; return memcg->swappiness; @@ -1599,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) } /* - * 2 routines for checking "mem" is under move_account() or not. + * A routine for checking "mem" is under move_account() or not. * - * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This - * is used for avoiding races in accounting. If true, - * pc->mem_cgroup may be overwritten. - * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - * under hierarchy of moving cgroups. This is for - * waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move". */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ - VM_BUG_ON(!rcu_read_lock_held()); - return atomic_read(&memcg->moving_account) > 0; -} - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; @@ -1658,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) * Take this lock when * - a code tries to modify page's memcg while it's USED. * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too. */ static void move_lock_mem_cgroup(struct mem_cgroup *memcg, unsigned long *flags) @@ -1683,15 +1647,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* - * protects memcg_name and makes sure that parallel ooms do not - * interleave - */ + /* oom_info_lock ensures that parallel ooms do not interleave */ static DEFINE_MUTEX(oom_info_lock); - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - static char memcg_name[PATH_MAX]; - int ret; struct mem_cgroup *iter; unsigned int i; @@ -1701,36 +1658,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) mutex_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - pr_info("Task in %s killed", memcg_name); - - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - /* - * Continues from above, so we don't need an KERN_ level - */ - pr_cont(" as a result of limit of %s\n", memcg_name); -done: - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, @@ -1745,13 +1680,8 @@ done: res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats"); - - rcu_read_lock(); - ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); - if (!ret) - pr_cont(" for %s", memcg_name); - rcu_read_unlock(); + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -2327,12 +2257,11 @@ cleanup: } /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics. * * Notes: Race condition * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but * it tends to be costly. But considering some conditions, we doesn't need * to do so _always_. * @@ -2346,8 +2275,8 @@ cleanup: * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect there are possibility + * of race or not. If there is, we take a lock. */ void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2365,9 +2294,10 @@ again: * If this memory cgroup is not under account moving, we don't * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock() if mem_cgroup_stolen() == true. + * rcu_read_unlock(). */ - if (!mem_cgroup_stolen(memcg)) + VM_BUG_ON(!rcu_read_lock_held()); + if (atomic_read(&memcg->moving_account) <= 0) return; move_lock_mem_cgroup(memcg, flags); @@ -2475,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } @@ -2622,7 +2552,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, } -/* See __mem_cgroup_try_charge() for details */ +/* See mem_cgroup_try_charge() for details */ enum { CHARGE_OK, /* success */ CHARGE_RETRY, /* need to retry but retry is not bad */ @@ -2695,45 +2625,35 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_NOMEM; } -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update res_counter - * 3. call memory reclaim if necessary. - * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - * 0 ... on success, filling *ptr with a valid memcg pointer. - * -ENOMEM ... charge failure because of resource limits. - * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - struct mem_cgroup **ptr, - bool oom) +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *memcg = NULL; int ret; + if (mem_cgroup_is_root(memcg)) + goto done; /* - * Unlike gloval-vm's OOM-kill, we're not in memory shortage - * in system level. So, allow to go ahead dying process in addition to - * MEMDIE process. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (unlikely(test_thread_flag(TIF_MEMDIE) - || fatal_signal_pending(current))) + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) goto bypass; if (unlikely(task_in_memcg_oom(current))) @@ -2741,73 +2661,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; - - /* - * We always charge the cgroup the mm_struct belongs to. - * The mm_struct's mem_cgroup changes on task migration if the - * thread group leader migrates. It's possible that mm is not - * set, if so charge the root memcg (happens for pagecache usage). - */ - if (!*ptr && !mm) - *ptr = root_mem_cgroup; again: - if (*ptr) { /* css should be a valid one */ - memcg = *ptr; - if (mem_cgroup_is_root(memcg)) - goto done; - if (consume_stock(memcg, nr_pages)) - goto done; - css_get(&memcg->css); - } else { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(mm->owner); - /* - * Because we don't have task_lock(), "p" can exit. - * In that case, "memcg" can point to root or p can be NULL with - * race with swapoff. Then, we have small risk of mis-accouning. - * But such kind of mis-account by race always happens because - * we don't have cgroup_mutex(). It's overkill and we allo that - * small race, here. - * (*) swapoff at el will charge against mm-struct not against - * task-struct. So, mm->owner can be NULL. - */ - memcg = mem_cgroup_from_task(p); - if (!memcg) - memcg = root_mem_cgroup; - if (mem_cgroup_is_root(memcg)) { - rcu_read_unlock(); - goto done; - } - if (consume_stock(memcg, nr_pages)) { - /* - * It seems dagerous to access memcg without css_get(). - * But considering how consume_stok works, it's not - * necessary. If consume_stock success, some charges - * from this memcg are cached on this cpu. So, we - * don't need to call css_get()/css_tryget() before - * calling consume_stock(). - */ - rcu_read_unlock(); - goto done; - } - /* after here, we may be blocked. we need to get refcnt */ - if (!css_tryget(&memcg->css)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - } + if (consume_stock(memcg, nr_pages)) + goto done; do { bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ - if (fatal_signal_pending(current)) { - css_put(&memcg->css); + if (fatal_signal_pending(current)) goto bypass; - } ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, invoke_oom); @@ -2816,17 +2679,12 @@ again: break; case CHARGE_RETRY: /* not in OOM situation but retry */ batch = nr_pages; - css_put(&memcg->css); - memcg = NULL; goto again; case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) { - css_put(&memcg->css); + if (!oom || invoke_oom) goto nomem; - } nr_oom_retries--; break; } @@ -2834,20 +2692,44 @@ again: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - css_put(&memcg->css); done: - *ptr = memcg; return 0; nomem: - if (!(gfp_mask & __GFP_NOFAIL)) { - *ptr = NULL; + if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; - } bypass: - *ptr = root_mem_cgroup; return -EINTR; } +/** + * mem_cgroup_try_charge_mm - try charging a mm + * @mm: mm_struct to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails + * + * Returns the charged mem_cgroup associated with the given mm_struct or + * NULL the charge failed. + */ +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) + +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + css_put(&memcg->css); + if (ret == -EINTR) + memcg = root_mem_cgroup; + else if (ret) + memcg = NULL; + + return memcg; +} + /* * Somemtimes we have to undo a charge we got by try_charge(). * This function is for that and do uncharge, put css's refcnt. @@ -2885,9 +2767,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2910,14 +2792,14 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); } @@ -2999,6 +2881,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -3031,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3043,20 +2931,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; - struct mem_cgroup *_memcg; int ret = 0; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - _memcg = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, oom_gfp_allowed(gfp)); - + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, + oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* - * __mem_cgroup_try_charge() chosed to bypass to root due to + * mem_cgroup_try_charge() chosed to bypass to root due to * OOM kill or fatal signal. Since our only options are to * either fail the allocation or charge it to this cgroup, do * it as a temporary condition. But we can't fail. From a @@ -3066,7 +2951,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) * * This condition will only trigger if the task entered * memcg_charge_kmem in a sane state, but was OOM-killed during - * __mem_cgroup_try_charge() above. Tasks that were already + * mem_cgroup_try_charge() above. Tasks that were already * dying when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ @@ -3139,8 +3024,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3214,8 +3097,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); + css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3224,30 +3106,44 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, void memcg_free_cache_params(struct kmem_cache *s) { + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + css_put(&s->memcg_params->memcg->css); kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + static char memcg_name_buf[NAME_MAX + 1]; /* protected by + memcg_slab_mutex */ + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. */ - lockdep_assert_held(&slab_mutex); - - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + if (cache_from_memcg_idx(root_cache, id)) + return; - css_get(&memcg->css); + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ + if (!cachep) + return; + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3256,51 +3152,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_unregister_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. - */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(!root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); - css_put(&memcg->css); + kmem_cache_destroy(cachep); } /* @@ -3334,195 +3209,74 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. - * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else - kmem_cache_destroy(cachep); -} - -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. - */ - schedule_work(&cachep->memcg_params->destroy); -} - -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *s) -{ - struct kmem_cache *new = NULL; - static char *tmp_name = NULL; - static DEFINE_MUTEX(mutex); /* protects tmp_name */ - - BUG_ON(!memcg_can_account_kmem(memcg)); - - mutex_lock(&mutex); - /* - * kmem_cache_create_memcg duplicates the given name and - * cgroup_name for this name requires RCU context. - * This static temporary buffer is used to prevent from - * pointless shortliving allocation. - */ - if (!tmp_name) { - tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); - if (!tmp_name) - goto out; - } - - rcu_read_lock(); - snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); - rcu_read_unlock(); - - new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, - (s->flags & ~SLAB_PANIC), s->ctor, s); - if (new) - new->allocflags |= __GFP_KMEMCG; - else - new = s; -out: - mutex_unlock(&mutex); - return new; -} - -void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; - int i; - - if (!s->memcg_params) - return; - if (!s->memcg_params->is_root_cache) - return; + int i, failed = 0; - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. - */ - c->memcg_params->dead = false; - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_unregister_cache(c); + + if (cache_from_memcg_idx(s, i)) + failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); + return failed; } -struct create_work { - struct mem_cgroup *memcg; - struct kmem_cache *cachep; - struct work_struct work; -}; - -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_unregister_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } -static void memcg_create_cache_work_func(struct work_struct *w) +struct memcg_register_cache_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw; + struct memcg_register_cache_work *cw = + container_of(w, struct memcg_register_cache_work, work); + struct mem_cgroup *memcg = cw->memcg; + struct kmem_cache *cachep = cw->cachep; - cw = container_of(w, struct create_work, work); - memcg_create_kmem_cache(cw->memcg, cw->cachep); - css_put(&cw->memcg->css); + mutex_lock(&memcg_slab_mutex); + memcg_register_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + + css_put(&memcg->css); kfree(cw); } /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3531,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3550,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3591,7 +3363,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, } /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) + if (!css_tryget_online(&memcg->css)) goto out; rcu_read_unlock(); @@ -3603,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. */ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); return cachep; } -EXPORT_SYMBOL(__memcg_kmem_get_cache); /* * We need to verify if the allocation against current->mm->owner's memcg is @@ -3645,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from @@ -3669,15 +3436,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) if (!current->mm || current->memcg_kmem_skip_account) return true; - memcg = try_get_mem_cgroup_from_mm(current->mm); - - /* - * very rare case described in mem_cgroup_from_task. Unfortunately there - * isn't much we can do without complicating this too much, and it would - * be gfp-dependent anyway. Just let it go - */ - if (unlikely(!memcg)) - return true; + memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_can_account_kmem(memcg)) { css_put(&memcg->css); @@ -3744,7 +3503,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -3780,19 +3539,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline -void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, - struct mem_cgroup *to, - unsigned int nr_pages, - enum mem_cgroup_stat_index idx) -{ - /* Update stat data for mem_cgroup */ - preempt_disable(); - __this_cpu_sub(from->stat->count[idx], nr_pages); - __this_cpu_add(to->stat->count[idx], nr_pages); - preempt_enable(); -} - /** * mem_cgroup_move_account - move account of the page * @page: the page @@ -3838,13 +3584,19 @@ static int mem_cgroup_move_account(struct page *page, move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_FILE_MAPPED); + if (!anon && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } - if (PageWriteback(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_WRITEBACK); + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } mem_cgroup_charge_statistics(from, page, anon, -nr_pages); @@ -3930,19 +3682,19 @@ out: return ret; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +int mem_cgroup_charge_anon(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; + struct mem_cgroup *memcg; bool oom = true; - int ret; + + if (mem_cgroup_disabled()) + return 0; + + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); + VM_BUG_ON(!mm); if (PageTransHuge(page)) { nr_pages <<= compound_order(page); @@ -3954,25 +3706,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, oom = false; } - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret == -ENOMEM) - return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + if (!memcg) + return -ENOMEM; + __mem_cgroup_commit_charge(memcg, page, nr_pages, + MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; } -int mem_cgroup_newpage_charge(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_disabled()) - return 0; - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - /* * While swap-in, try_charge -> commit or cancel, the page is locked. * And when try_charge() successfully returns, one refcnt to memcg without @@ -3984,7 +3725,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, gfp_t mask, struct mem_cgroup **memcgp) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; int ret; @@ -3997,31 +3738,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, * in turn serializes uncharging. */ if (PageCgroupUsed(pc)) - return 0; - if (!do_swap_account) - goto charge_cur_mm; - memcg = try_get_mem_cgroup_from_page(page); + goto out; + if (do_swap_account) + memcg = try_get_mem_cgroup_from_page(page); if (!memcg) - goto charge_cur_mm; - *memcgp = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, mask, 1, true); css_put(&memcg->css); if (ret == -EINTR) - ret = 0; - return ret; -charge_cur_mm: - ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; + memcg = root_mem_cgroup; + else if (ret) + return ret; +out: + *memcgp = memcg; + return 0; } int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) { - *memcgp = NULL; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled()) { + *memcgp = NULL; return 0; + } /* * A racing thread's fault, or swapoff, may have already * updated the pte, and even removed page from swap cache: in @@ -4029,12 +3768,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, * there's also a KSM case which does need to charge the page. */ if (!PageSwapCache(page)) { - int ret; + struct mem_cgroup *memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; + *memcgp = memcg; + return 0; } return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); } @@ -4078,11 +3818,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page, MEM_CGROUP_CHARGE_TYPE_ANON); } -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + struct mem_cgroup *memcg; int ret; if (mem_cgroup_disabled()) @@ -4090,15 +3830,20 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ + if (PageSwapCache(page)) { /* shmem */ ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); - if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, type); + if (ret) + return ret; + __mem_cgroup_commit_charge_swapin(page, memcg, type); + return 0; } - return ret; + + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + return 0; } static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, @@ -4378,8 +4123,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) memcg = mem_cgroup_lookup(id); if (memcg) { /* - * We uncharge this because swap is freed. - * This memcg can be obsolete one. We avoid calling css_tryget + * We uncharge this because swap is freed. This memcg can + * be obsolete one. We avoid calling css_tryget_online(). */ if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); @@ -4833,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add @@ -4844,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * term TODO. */ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); css_put(&mz->memcg->css); loop++; @@ -4911,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. */ busy = page; - cond_resched(); } else busy = NULL; + cond_resched(); } while (!list_empty(list)); } @@ -4964,18 +4709,28 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } while (usage > 0); } +/* + * Test whether @memcg has children, dead or alive. Note that this + * function doesn't care whether @memcg has use_hierarchy enabled and + * returns %true if there are child csses according to the cgroup + * hierarchy. Testing use_hierarchy is the caller's responsiblity. + */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { - lockdep_assert_held(&memcg_create_mutex); + bool ret; + /* - * The lock does not prevent addition or deletion to the list - * of children, but it prevents a new child from being - * initialized based on this parent in css_online(), so it's - * enough to decide whether hierarchically inherited - * attributes can still be changed or not. + * The lock does not prevent addition or deletion of children, but + * it prevents a new child from being initialized based on this + * parent in css_online(), so it's enough to decide whether + * hierarchically inherited attributes can still be changed or not. */ - return memcg->use_hierarchy && - !list_empty(&memcg->css.cgroup->children); + lockdep_assert_held(&memcg_create_mutex); + + rcu_read_lock(); + ret = css_next_child(NULL, &memcg->css); + rcu_read_unlock(); + return ret; } /* @@ -4987,11 +4742,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) static int mem_cgroup_force_empty(struct mem_cgroup *memcg) { int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = memcg->css.cgroup; - - /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) - return -EBUSY; /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); @@ -5011,20 +4761,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) } } - lru_add_drain(); - mem_cgroup_reparent_charges(memcg); return 0; } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); if (mem_cgroup_is_root(memcg)) return -EINVAL; - return mem_cgroup_force_empty(memcg); + return mem_cgroup_force_empty(memcg) ?: nbytes; } static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -5038,7 +4787,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); mutex_lock(&memcg_create_mutex); @@ -5055,7 +4804,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (list_empty(&memcg->css.cgroup->children)) + if (!memcg_has_children(memcg)) memcg->use_hierarchy = val; else retval = -EBUSY; @@ -5172,7 +4921,8 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. */ mutex_lock(&memcg_create_mutex); - if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) + if (cgroup_has_tasks(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) err = -EBUSY; mutex_unlock(&memcg_create_mutex); if (err) @@ -5189,13 +4939,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. */ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the @@ -5273,17 +5024,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); enum res_type type; int name; unsigned long long val; int ret; - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -5292,7 +5044,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; if (type == _MEM) @@ -5305,7 +5057,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; break; case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; /* @@ -5322,7 +5074,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; /* should be BUG() ? */ break; } - return ret; + return ret ?: nbytes; } static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5335,8 +5087,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, if (!memcg->use_hierarchy) goto out; - while (css_parent(&memcg->css)) { - memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + while (memcg->css.parent) { + memcg = mem_cgroup_from_css(memcg->css.parent); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5349,14 +5101,15 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); int name; enum res_type type; - type = MEMFILE_TYPE(event); - name = MEMFILE_ATTR(event); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -5381,7 +5134,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -5540,7 +5293,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat; recent_rotated[0] += rstat->recent_rotated[0]; @@ -5570,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - - if (val > 100 || !parent) - return -EINVAL; - - mutex_lock(&memcg_create_mutex); - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); + if (val > 100) return -EINVAL; - } - - memcg->swappiness = val; - mutex_unlock(&memcg_create_mutex); + if (css->parent) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5670,8 +5415,12 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) { struct mem_cgroup_eventfd_list *ev; + spin_lock(&memcg_oom_lock); + list_for_each_entry(ev, &memcg->oom_notify, list) eventfd_signal(ev->eventfd, 1); + + spin_unlock(&memcg_oom_lock); return 0; } @@ -5917,22 +5666,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css->parent || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } @@ -5972,10 +5714,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) * which is then paired with css_put during uncharge resp. here. * * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 - * and shouldn't be incremented anymore (css_tryget would fail) - * we do not have other options because of the kmem allocations - * lifetime. + * css_offline() when the referencemight have dropped down to 0 and + * shouldn't be incremented anymore (css_tryget_online() would + * fail) we do not have other options because of the kmem + * allocations lifetime. */ css_get(&memcg->css); @@ -6094,9 +5836,10 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation. */ -static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { + struct cgroup_subsys_state *css = of_css(of); struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; @@ -6107,15 +5850,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, char *endp; int ret; - efd = simple_strtoul(buffer, &endp, 10); + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); if (*endp != ' ') return -EINVAL; - buffer = endp + 1; + buf = endp + 1; - cfd = simple_strtoul(buffer, &endp, 10); + cfd = simple_strtoul(buf, &endp, 10); if ((*endp != ' ') && (*endp != '\0')) return -EINVAL; - buffer = endp + 1; + buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) @@ -6183,19 +5928,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - rcu_read_lock(); - + cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &mem_cgroup_subsys); - if (cfile_css == css && css_tryget(css)) - ret = 0; - - rcu_read_unlock(); - if (ret) + if (IS_ERR(cfile_css)) goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } - ret = event->register_event(memcg, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buf); if (ret) goto out_put_css; @@ -6208,7 +5951,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, fdput(cfile); fdput(efile); - return 0; + return nbytes; out_put_css: css_put(css); @@ -6233,25 +5976,25 @@ static struct cftype mem_cgroup_files[] = { { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6260,17 +6003,16 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", - .trigger = mem_cgroup_force_empty_write, + .write = mem_cgroup_force_empty_write, }, { .name = "use_hierarchy", - .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, { .name = "cgroup.event_control", /* XXX: for compat */ - .write_string = memcg_write_event_control, + .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6303,7 +6045,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6314,13 +6056,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO @@ -6343,19 +6085,19 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ @@ -6533,9 +6275,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); + struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - if (css->cgroup->id > MEM_CGROUP_ID_MAX) + if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; if (!parent) @@ -6566,11 +6308,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * unfortunate state in our controller. */ if (parent != root_mem_cgroup) - mem_cgroup_subsys.broken_hierarchy = true; + memory_cgrp_subsys.broken_hierarchy = true; } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &mem_cgroup_subsys); + return memcg_init_kmem(memcg, &memory_cgrp_subsys); } /* @@ -6620,7 +6362,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -6630,7 +6372,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) /* * XXX: css_offline() would be where we should reparent all * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget() and res_counter charging + * memcg does not do css_tryget_online() and res_counter charging * under the same RCU lock region, which means that charging * could race with offlining. Offlining only happens to * cgroups with no tasks in them but charges can show up @@ -6644,9 +6386,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) * lookup_swap_cgroup_id() * rcu_read_lock() * mem_cgroup_lookup() - * css_tryget() + * css_tryget_online() * rcu_read_unlock() - * disable css_tryget() + * disable css_tryget_online() * call_rcu() * offline_css() * reparent_charges() @@ -6668,6 +6410,29 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) __mem_cgroup_free(memcg); } +/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_resize_limit(memcg, ULLONG_MAX); + mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); + memcg_update_kmem_limit(memcg, ULLONG_MAX); + res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ #define PRECHARGE_COUNT_AT_ONCE 256 @@ -6712,8 +6477,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, - GFP_KERNEL, 1, &memcg, false); + ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ return ret; @@ -6817,16 +6581,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } @@ -7259,31 +7027,31 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, /* * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify sane_behavior flag on each mount attempt. + * to verify whether we're attached to the default hierarchy on each mount + * attempt. */ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* - * use_hierarchy is forced with sane_behavior. cgroup core + * use_hierarchy is forced on the default hierarchy. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root_css->cgroup)) + if (cgroup_on_dfl(root_css->cgroup)) mem_cgroup_from_css(root_css)->use_hierarchy = true; } -struct cgroup_subsys mem_cgroup_subsys = { - .name = "memory", - .subsys_id = mem_cgroup_subsys_id, +struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .base_cftypes = mem_cgroup_files, + .legacy_cftypes = mem_cgroup_files, .early_init = 0, }; @@ -7300,7 +7068,8 @@ __setup("swapaccount=", enable_swap_account); static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memsw_cgroup_files)); } static void __init enable_swap_cgroup(void) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 90002ea..a013bc9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -208,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #endif si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; - if ((flags & MF_ACTION_REQUIRED) && t == current) { + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { si.si_code = BUS_MCEERR_AR; - ret = force_sig_info(SIGBUS, &si, t); + ret = force_sig_info(SIGBUS, &si, current); } else { /* * Don't use force here, it's convenient if the signal @@ -384,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) { + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) +{ + struct task_struct *t; if (!tsk->mm) - return 0; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return NULL; + if (force_early) + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -408,20 +435,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff = page_to_pgoff(page); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -432,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -441,11 +469,11 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -455,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -469,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -480,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -866,8 +895,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct page *hpage = *hpagep; struct page *ppage; + /* + * Here we are interested only in user-mapped pages, so skip any + * other types of pages. + */ if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; + if (!(PageLRU(hpage) || PageHuge(p))) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages @@ -876,8 +911,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(hpage)) return SWAP_SUCCESS; - if (PageKsm(p)) + if (PageKsm(p)) { + pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); return SWAP_FAIL; + } if (PageSwapCache(p)) { printk(KERN_ERR @@ -967,7 +1004,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill); + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) @@ -1085,15 +1122,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } else if (PageHuge(hpage)) { /* - * Check "just unpoisoned", "filter hit", and - * "race with other subpage." + * Check "filter hit" and "race with other subpage." */ lock_page(hpage); - if (!PageHWPoison(hpage) - || (hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &num_poisoned_pages); - return 0; + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); @@ -1129,17 +1167,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) action_result(pfn, "free buddy, 2nd try", DELAYED); return 0; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } } - /* - * Lock the page and wait for writeback to finish. - * It's very difficult to mess with pages currently under IO - * and in many cases impossible, so we just avoid it here. - */ lock_page(hpage); /* @@ -1156,6 +1186,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(hpage); res = 0; goto out; } @@ -1167,6 +1199,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + /* * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned @@ -1187,6 +1222,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHuge(p)) set_page_hwpoison_huge_page(hpage); + /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ wait_on_page_writeback(p); /* @@ -1198,7 +1237,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + action_result(pfn, "unmapping failed", IGNORED); res = -EBUSY; goto out; } @@ -1212,6 +1251,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) goto out; } +identify_page_state: res = -EBUSY; /* * The first check uses the current page flags which may not have any @@ -1299,7 +1339,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = &__get_cpu_var(memory_failure_cpu); + mf_cpu = this_cpu_ptr(&memory_failure_cpu); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); @@ -1504,7 +1544,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1585,7 +1625,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { if (!list_empty(&pagelist)) { @@ -1665,11 +1705,7 @@ int soft_offline_page(struct page *page, int flags) } } - /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); + get_online_mems(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1680,7 +1716,7 @@ int soft_offline_page(struct page *page, int flags) set_migratetype_isolate(page, true); ret = get_any_page(page, pfn, flags); - unlock_memory_hotplug(); + put_online_mems(); if (ret > 0) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); diff --git a/mm/memory.c b/mm/memory.c index 22dfa61..8b44f76 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,6 +60,7 @@ #include <linux/migrate.h> #include <linux/string.h> #include <linux/dma-debug.h> +#include <linux/debugfs.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -231,17 +232,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #endif } -void tlb_flush_mmu(struct mmu_gather *tlb) +static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { - struct mmu_gather_batch *batch; - - if (!tlb->need_flush) - return; tlb->need_flush = 0; tlb_flush(tlb); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif +} + +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; for (batch = &tlb->local; batch; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); @@ -250,6 +252,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb->active = &tlb->local; } +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + if (!tlb->need_flush) + return; + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + /* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources * that were required. @@ -688,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -746,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) + if (likely(!pte_special(pte) || pte_numa(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -772,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - if (is_zero_pfn(pfn)) - return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } + if (is_zero_pfn(pfn)) + return NULL; + /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. @@ -1126,8 +1132,10 @@ again: if (PageAnon(page)) rss[MM_ANONPAGES]--; else { - if (pte_dirty(ptent)) + if (pte_dirty(ptent)) { + force_flush = 1; set_page_dirty(page); + } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); @@ -1136,9 +1144,10 @@ again: page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (unlikely(!__tlb_remove_page(tlb, page))) { + force_flush = 1; break; + } continue; } /* @@ -1173,18 +1182,11 @@ again: add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - /* - * mmu_gather ran out of room to batch pages, we break out of - * the PTE lock to avoid doing the potential expensive TLB invalidate - * and page-free while holding it. - */ + /* Do the actual TLB flush before dropping ptl */ if (force_flush) { unsigned long old_end; - force_flush = 0; - /* * Flush the TLB just for the previous segment, * then update the range to be the remaining @@ -1192,11 +1194,21 @@ again: */ old_end = tlb->end; tlb->end = addr; - - tlb_flush_mmu(tlb); - + tlb_flush_mmu_tlbonly(tlb); tlb->start = addr; tlb->end = old_end; + } + pte_unmap_unlock(start_pte, ptl); + + /* + * If we forced a TLB flush (either due to running out of + * batch buffers or because we needed to flush dirty TLB + * entries before releasing the ptl), free the batched + * memory too. Restart if we didn't do everything. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu_free(tlb); if (addr != end) goto again; @@ -1320,9 +1332,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When + * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file + * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. @@ -1441,617 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: - if (unlikely(pmd_bad(*pmd))) - goto no_page_table; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte) || pte_file(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto split_fallthrough; - } - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); - } - - if (flags & FOLL_GET) - get_page_foll(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - /* - * pte_mkyoung() would be more correct here, but atomic care - * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). - */ - mark_page_accessed(page); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return page; - -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - -no_page: - pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) - return page; - -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. - */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; -} - -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - -/** - * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * __get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * __get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If - * the page is written to, set_page_dirty (or set_page_dirty_lock, as - * appropriate) must be called after the page is finished with, and - * before put_page is called. - * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. - * - * In most cases, get_user_pages or get_user_pages_fast should be used - * instead of __get_user_pages. __get_user_pages should be used only if - * you need some special @gup_flags. - */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - long i; - unsigned long vm_flags; - unsigned int page_mask; - - if (!nr_pages) - return 0; - - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - - /* - * Require read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (gup_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (gup_flags & FOLL_FORCE) ? - (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - - i = 0; - - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - return i ? : -EFAULT; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) - return i ? : -EFAULT; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - return i ? : -EFAULT; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - return i ? : -EFAULT; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); - page_mask = 0; - goto next_page; - } - - if (!vma || - (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - return i ? : -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; - } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - return i ? i : -EFAULT; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; - return i; - } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - - cond_resched(); - } - if (IS_ERR(page)) - return i ? i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } -next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); - } while (nr_pages); - return i; -} -EXPORT_SYMBOL(__get_user_pages); - -/* - * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * - * This is meant to be called in the specific scenario where for locking reasons - * we try to access user memory in atomic context (within a pagefault_disable() - * section), this returns -EFAULT, and we want to resolve the user fault before - * trying again. - * - * Typically this is meant to be used by the futex code. - * - * The main difference with get_user_pages() is that this function will - * unconditionally call handle_mm_fault() which will in turn perform all the - * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. - * - * This is important for some architectures where those bits also gate the - * access permission to the page because they are maintained in software. On - * such architectures, gup() will not be enough to make a subsequent access - * succeed. - * - * This should be called with the mm_sem held for read. - */ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) -{ - struct vm_area_struct *vma; - int ret; - - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) - return -EFAULT; - - ret = handle_mm_fault(mm, vma, address, fault_flags); - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) - return -EFAULT; - BUG(); - } - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - return 0; -} - -/* - * get_user_pages() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. This will result in the page being COWed even - * in MAP_SHARED mappings. You do not want this. - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. - * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. - */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) -{ - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); -} -EXPORT_SYMBOL(get_user_pages); - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ - pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { @@ -2587,6 +1988,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo } /* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = page; + + ret = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + if (unlikely(!(ret & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + unlock_page(page); + return 0; /* retry */ + } + ret |= VM_FAULT_LOCKED; + } else + VM_BUG_ON_PAGE(!PageLocked(page), page); + return ret; +} + +/* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. @@ -2668,42 +2101,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - struct vm_fault vmf; int tmp; - - vmf.virtual_address = (void __user *)(address & - PAGE_MASK); - vmf.pgoff = old_page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.page = old_page; - - /* - * Notify the address space that the page is about to - * become writable so that it can prohibit this or wait - * for the page to get into an appropriate state. - * - * We do this without the lock held, so that it can - * sleep if it needs to. - */ page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(old_page); - if (!old_page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(old_page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); - /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If @@ -2748,11 +2154,11 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * __do_fault is protected similarly. + * do_shared_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + set_page_dirty_balance(dirty_page); /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); @@ -2798,7 +2204,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -2892,10 +2298,6 @@ oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; - -unwritable_page: - page_cache_release(old_page); - return ret; } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -3255,7 +2657,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3286,53 +2688,11 @@ oom: return VM_FAULT_OOM; } -/* - * __do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int __do_fault(struct vm_area_struct *vma, unsigned long address, + pgoff_t pgoff, unsigned int flags, struct page **page) { - pte_t *page_table; - spinlock_t *ptl; - struct page *page; - struct page *cow_page; - pte_t entry; - int anon = 0; - struct page *dirty_page = NULL; struct vm_fault vmf; int ret; - int page_mkwrite = 0; - - /* - * If we do COW later, allocate page befor taking lock_page() - * on the file cache page. This will reduce lock holding time. - */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!cow_page) - return VM_FAULT_OOM; - - if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { - page_cache_release(cow_page); - return VM_FAULT_OOM; - } - } else - cow_page = NULL; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; @@ -3340,151 +2700,319 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) - goto uncharge_out; + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - ret = VM_FAULT_HWPOISON; page_cache_release(vmf.page); - goto uncharge_out; + return VM_FAULT_HWPOISON; } - /* - * For consistency in subsequent calls, make the faulted page always - * locked. - */ if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + *page = vmf.page; + return ret; +} + +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. + */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon) +{ + pte_t entry; + + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) + pte_mksoft_dirty(entry); + if (anon) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + page_add_file_rmap(page); + } + set_pte_at(vma->vm_mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); +} + +static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); + +static inline unsigned long fault_around_pages(void) +{ + return fault_around_bytes >> PAGE_SHIFT; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~(fault_around_bytes - 1) & PAGE_MASK; +} + +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) +{ + *val = fault_around_bytes; + return 0; +} + +/* + * fault_around_pages() and fault_around_mask() expects fault_around_bytes + * rounded down to nearest page order. It's what do_fault_around() expects to + * see. + */ +static int fault_around_bytes_set(void *data, u64 val) +{ + if (val / PAGE_SIZE > PTRS_PER_PTE) + return -EINVAL; + if (val > PAGE_SIZE) + fault_around_bytes = rounddown_pow_of_two(val); + else + fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, + fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); + if (!ret) + pr_warn("Failed to create fault_around_bytes in debugfs"); + return 0; +} +late_initcall(fault_around_debugfs); +#endif + +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. + * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order). This way it's + * easier to guarantee that we don't cross page table boundaries. + */ +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + start_addr = max(address & fault_around_mask(), vma->vm_start); + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + pte -= off; + pgoff -= off; + /* - * Should we do an early C-O-W break? + * max_pgoff is either end of page table or end of vma + * or fault_around_pages() from pgoff, depending what is nearest. */ - page = vmf.page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - page = cow_page; - anon = 1; - copy_user_highpage(page, vmf.page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - int tmp; - - unlock_page(page); - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; - } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(page), page); - page_mkwrite = 1; - } - } - + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + fault_around_pages() - 1); + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + spinlock_t *ptl; + pte_t *pte; + int ret = 0; /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). */ - /* Only go through if we didn't race with anybody else... */ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) - pte_mksoft_dirty(entry); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } - } - set_pte_at(mm, address, page_table, entry); + if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && + fault_around_pages() > 1) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; + pte_unmap_unlock(pte, ptl); + } - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; } + do_set_pte(vma, address, fault_page, pte, false, false); + unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); + return ret; +} - pte_unmap_unlock(page_table, ptl); +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page, *new_page; + spinlock_t *ptl; + pte_t *pte; + int ret; - if (dirty_page) { - struct address_space *mapping = page->mapping; - int dirtied = 0; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; - if (set_page_dirty(dirty_page)) - dirtied = 1; - unlock_page(dirty_page); - put_page(dirty_page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + return VM_FAULT_OOM; - /* file_update_time outside page_lock */ - if (vma->vm_file && !page_mkwrite) - file_update_time(vma->vm_file); - } else { - unlock_page(vmf.page); - if (anon) - page_cache_release(vmf.page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { + page_cache_release(new_page); + return VM_FAULT_OOM; } - return ret; + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; -unwritable_page: - page_cache_release(page); + copy_user_highpage(new_page, fault_page, address, vma); + __SetPageUptodate(new_page); + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + goto uncharge_out; + } + do_set_pte(vma, address, new_page, pte, true, true); + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); return ret; uncharge_out: - /* fs's fault handler get error */ - if (cow_page) { - mem_cgroup_uncharge_page(cow_page); - page_cache_release(cow_page); + mem_cgroup_uncharge_page(new_page); + page_cache_release(new_page); + return ret; +} + +static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + struct address_space *mapping; + spinlock_t *ptl; + pte_t *pte; + int dirtied = 0; + int ret, tmp; + + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + /* + * Check if the backing address space wants to know that the page is + * about to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(fault_page); + tmp = do_page_mkwrite(vma, fault_page, address); + if (unlikely(!tmp || + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(fault_page); + return tmp; + } } + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + do_set_pte(vma, address, fault_page, pte, true, false); + pte_unmap_unlock(pte, ptl); + + if (set_page_dirty(fault_page)) + dirtied = 1; + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file && !vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); + return ret; } @@ -3496,7 +3024,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3528,10 +3062,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { @@ -3546,7 +3086,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { struct page *page = NULL; @@ -3751,9 +3291,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db2..469bbf5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -46,19 +46,84 @@ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; +static DEFINE_MUTEX(online_page_callback_lock); -DEFINE_MUTEX(mem_hotplug_mutex); +/* The same as the cpu_hotplug lock, but for memory hotplug. */ +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing mem hotplug operation. + */ + int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} mem_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), + .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "mem_hotplug.lock" }, +#endif +}; + +/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ +#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) +#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) +#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) + +void get_online_mems(void) +{ + might_sleep(); + if (mem_hotplug.active_writer == current) + return; + memhp_lock_acquire_read(); + mutex_lock(&mem_hotplug.lock); + mem_hotplug.refcount++; + mutex_unlock(&mem_hotplug.lock); + +} -void lock_memory_hotplug(void) +void put_online_mems(void) { - mutex_lock(&mem_hotplug_mutex); + if (mem_hotplug.active_writer == current) + return; + mutex_lock(&mem_hotplug.lock); + + if (WARN_ON(!mem_hotplug.refcount)) + mem_hotplug.refcount++; /* try to fix things up */ + + if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) + wake_up_process(mem_hotplug.active_writer); + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); + } -void unlock_memory_hotplug(void) +static void mem_hotplug_begin(void) { - mutex_unlock(&mem_hotplug_mutex); + mem_hotplug.active_writer = current; + + memhp_lock_acquire(); + for (;;) { + mutex_lock(&mem_hotplug.lock); + if (likely(!mem_hotplug.refcount)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&mem_hotplug.lock); + schedule(); + } } +static void mem_hotplug_done(void) +{ + mem_hotplug.active_writer = NULL; + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); +} /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) @@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == generic_online_page) { online_page_callback = callback; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == callback) { online_page_callback = generic_online_page; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - lock_memory_hotplug(); + mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); + ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && - !can_online_high_movable(zone)) { - unlock_memory_hotplug(); - return -EINVAL; - } + !can_online_high_movable(zone)) + goto out; if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { - if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) + goto out; } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { - if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) + goto out; } /* Previous code may changed the zone of the pfn range */ @@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } /* * If this zone is not populated, then it is not in zonelist. @@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } zone->present_pages += onlined_pages; @@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); - unlock_memory_hotplug(); - - return 0; +out: + mem_hotplug_done(); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1007,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) struct pglist_data *pgdat; unsigned long zones_size[MAX_NR_ZONES] = {0}; unsigned long zholes_size[MAX_NR_ZONES] = {0}; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -1055,7 +1117,7 @@ int try_online_node(int nid) if (node_online(nid)) return 0; - lock_memory_hotplug(); + mem_hotplug_begin(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); @@ -1073,13 +1135,13 @@ int try_online_node(int nid) } out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = start >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); u64 nr_pages = size >> PAGE_SHIFT; /* Memory range must be aligned with section */ @@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size) new_pgdat = !p; } - lock_memory_hotplug(); + mem_hotplug_begin(); new_node = !node_online(nid); if (new_node) { @@ -1158,7 +1220,7 @@ error: release_memory_resource(res); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. */ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); @@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_memory_hotplug(); + mem_hotplug_begin(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -1672,7 +1734,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_memory_hotplug(); + mem_hotplug_done(); return 0; failed_removal: @@ -1684,7 +1746,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } @@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) BUG_ON(check_hotplug_memory_range(start, size)); - lock_memory_hotplug(); + mem_hotplug_begin(); /* * All memory blocks must be offlined before removing memory. Check @@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, check_memblock_offlined_cb); - if (ret) { - unlock_memory_hotplug(); + if (ret) BUG(); - } /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) try_offline_node(nid); - unlock_memory_hotplug(); + mem_hotplug_done(); } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4755c85..8f5330d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -65,6 +65,8 @@ kernel is not always grateful with that. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/highmem.h> @@ -91,6 +93,7 @@ #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -526,9 +529,13 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, int nid; struct page *page; spinlock_t *ptl; + pte_t entry; ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); - page = pte_page(huge_ptep_get((pte_t *)pmd)); + entry = huge_ptep_get((pte_t *)pmd); + if (!pte_present(entry)) + goto unlock; + page = pte_page(entry); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) goto unlock; @@ -649,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, * @nodes and @flags,) it's isolated and queued to the pagelist which is * passed via @private.) */ -static struct vm_area_struct * +static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { - int err; - struct vm_area_struct *first, *vma, *prev; - + int err = 0; + struct vm_area_struct *vma, *prev; - first = find_vma(mm, start); - if (!first) - return ERR_PTR(-EFAULT); + vma = find_vma(mm, start); + if (!vma) + return -EFAULT; prev = NULL; - for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + for (; vma && vma->vm_start < end; vma = vma->vm_next) { unsigned long endvma = vma->vm_end; if (endvma > end) @@ -671,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); + return -EFAULT; if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); + return -EFAULT; } if (flags & MPOL_MF_LAZY) { @@ -687,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); - if (err) { - first = ERR_PTR(err); + if (err) break; - } } next: prev = vma; } - return first; + return err; } /* @@ -795,36 +799,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. - */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) @@ -861,7 +835,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); @@ -1059,7 +1032,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1180,16 +1153,17 @@ out: /* * Allocate a new page for page migration based on vma policy. - * Start assuming that page is mapped by vma pointed to by @private. + * Start by assuming the page is mapped by the same vma as contains @start. * Search forward from there, if not. N.B., this assumes that the * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { - struct vm_area_struct *vma = (struct vm_area_struct *)private; + struct vm_area_struct *vma; unsigned long uninitialized_var(address); + vma = find_vma(current->mm, start); while (vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) @@ -1219,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x) { return NULL; } @@ -1229,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { - struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; @@ -1295,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = queue_pages_range(mm, start, end, nmask, + err = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); /* maybe ... */ - if (!IS_ERR(vma)) + if (!err) err = mbind_range(mm, start, end, new); if (!err) { @@ -1307,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, - MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + nr_failed = migrate_pages(&pagelist, new_page, NULL, + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); } @@ -1393,7 +1363,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, unsigned long __user *, nmask, + unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; @@ -1414,7 +1384,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { int err; @@ -1637,9 +1607,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, /* * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma - virtual memory area whose policy is sought - * @addr - address in @vma for shared policy lookup + * @task: task for fallback if vma policy == default + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. @@ -1782,21 +1752,18 @@ static unsigned interleave_nodes(struct mempolicy *policy) /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. - * @policy must be protected by freeing by the caller. If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy. The system default policy requires no - * such protection. */ -unsigned slab_node(void) +unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; + int node = numa_mem_id(); if (in_interrupt()) - return numa_node_id(); + return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + return node; switch (policy->mode) { case MPOL_PREFERRED: @@ -1816,11 +1783,11 @@ unsigned slab_node(void) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + return zone ? zone->node : node; } default: @@ -1888,18 +1855,18 @@ int node_random(const nodemask_t *maskp) #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -2063,7 +2030,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -2071,7 +2038,7 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2081,7 +2048,7 @@ retry_cpuset: policy_nodemask(gfp, pol)); if (unlikely(mpol_needs_cond_ref(pol))) __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2115,7 +2082,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2128,7 +2095,7 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2172,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) } else *new = *old; - rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); if (new->flags & MPOL_F_REBINDING) @@ -2180,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) else mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } - rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; } @@ -2304,9 +2269,9 @@ static void sp_free(struct sp_node *n) /** * mpol_misplaced - check whether current page node is valid in policy * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. @@ -2679,7 +2644,7 @@ void __init numa_policy_init(void) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } diff --git a/mm/mempool.c b/mm/mempool.c index 659aa42..e209c98 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/blkdev.h> @@ -192,6 +193,7 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) + * Note: using __GFP_ZERO is not supported. */ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { @@ -200,6 +202,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) wait_queue_t wait; gfp_t gfp_temp; + VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_sleep_if(gfp_mask & __GFP_WAIT); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ @@ -220,6 +223,11 @@ repeat_alloc: spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); + /* + * Update the allocation stack trace as this is more useful + * for debugging. + */ + kmemleak_update_trace(element); return element; } @@ -304,9 +312,9 @@ void mempool_free(void *element, mempool_t *pool) * ensures that there will be frees which return elements to the * pool waking up the waiters. */ - if (pool->curr_nr < pool->min_nr) { + if (unlikely(pool->curr_nr < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); - if (pool->curr_nr < pool->min_nr) { + if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); diff --git a/mm/migrate.c b/mm/migrate.c index bed4880..be6dbf9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; - if (pmd_trans_huge(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); @@ -938,8 +936,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -983,11 +982,18 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. */ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { + ClearPageSwapBacked(newpage); + put_new_page(newpage, private); + } else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -1016,8 +1022,9 @@ out: * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -1031,7 +1038,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_support(page_hstate(hpage))) { + if (!hugepage_migration_supported(page_hstate(hpage))) { putback_active_hugepage(hpage); return -ENOSYS; } @@ -1056,20 +1063,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1086,6 +1103,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1099,7 +1118,8 @@ out: * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1121,10 +1141,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1273,7 +1294,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1729,7 +1750,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); @@ -1852,7 +1874,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_new_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start); pmdp_clear_flush(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1877,6 +1899,10 @@ fail_putback: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Take an "isolate" reference and put new page on the LRU. */ + get_page(new_page); + putback_lru_page(new_page); + unlock_page(new_page); unlock_page(page); put_page(page); /* Drop the rmap reference */ diff --git a/mm/mincore.c b/mm/mincore.c index 1016233..725c809 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif if (page) { present = PageUptodate(page); @@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page) */ void mlock_vma_page(struct page *page) { + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { @@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages; struct zone *zone = page_zone(page); + /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); /* @@ -6,10 +6,13 @@ * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -36,6 +39,7 @@ #include <linux/sched/sysctl.h> #include <linux/notifier.h> #include <linux/memory.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -360,20 +364,20 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - printk("vm_end %lx < vm_start %lx\n", + pr_info("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - printk("free gap %lx, correct %lx\n", + pr_info("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -387,7 +391,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - printk("backwards %d, forwards %d\n", j, i); + pr_info("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? -1 : i; @@ -405,7 +409,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) } } -void validate_mm(struct mm_struct *mm) +static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; @@ -422,17 +426,17 @@ void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - printk("map_count %d vm_next %d\n", mm->map_count, i); + pr_info("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - printk("mm->highest_vm_end %lx, found %lx\n", + pr_info("mm->highest_vm_end %lx, found %lx\n", mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - printk("map_count %d rb %d\n", mm->map_count, i); + pr_info("map_count %d rb %d\n", mm->map_count, i); bug = 1; } BUG_ON(bug); @@ -639,11 +643,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { struct address_space *mapping = NULL; - if (vma->vm_file) + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - - if (mapping) mutex_lock(&mapping->i_mmap_mutex); + } __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); @@ -681,8 +684,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1299,7 +1303,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* * Make sure there are no mandatory locks on the file. */ - if (locks_verify_locked(inode)) + if (locks_verify_locked(file)) return -EAGAIN; vm_flags |= VM_SHARED | VM_MAYSHARE; @@ -1989,34 +1993,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node *rb_node; + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; - rb_node = mm->mm_rb.rb_node; - vma = NULL; + rb_node = mm->mm_rb.rb_node; + vma = NULL; - while (rb_node) { - struct vm_area_struct *vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -2388,7 +2391,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. */ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -2869,6 +2874,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) return 1; } +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; static int special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -2884,7 +2914,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, */ pgoff = vmf->pgoff - vma->vm_pgoff; - for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + if (vma->vm_ops == &legacy_special_mapping_vmops) + pages = vma->vm_private_data; + else + pages = ((struct vm_special_mapping *)vma->vm_private_data)-> + pages; + + for (; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -2897,30 +2933,11 @@ static int special_mapping_fault(struct vm_area_struct *vma, return VM_FAULT_SIGBUS; } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. - */ -struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_operations_struct *ops, + void *priv) { int ret; struct vm_area_struct *vma; @@ -2937,8 +2954,8 @@ struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - vma->vm_ops = &special_mapping_vmops; - vma->vm_private_data = pages; + vma->vm_ops = ops; + vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) @@ -2955,16 +2972,33 @@ out: return ERR_PTR(ret); } +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, + &special_mapping_vmops, (void *)spec); +} + int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { - struct vm_area_struct *vma = _install_special_mapping(mm, - addr, len, vm_flags, pages); + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, &legacy_special_mapping_vmops, + (void *)pages); - if (IS_ERR(vma)) - return PTR_ERR(vma); - return 0; + return PTR_ERR_OR_ZERO(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); @@ -3249,7 +3283,7 @@ static struct notifier_block reserve_mem_nb = { static int __meminit init_reserve_notifier(void) { if (register_hotmemory_notifier(&reserve_mem_nb)) - printk("Failed registering memory add/remove notifier for admin reserve"); + pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 769a67a..c43d557 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; unsigned long pages = 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) + return 0; + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; + struct mm_struct *mm = vma->vm_mm; unsigned long next; unsigned long pages = 0; unsigned long nr_huge_updates = 0; + unsigned long mni_start = 0; pmd = pmd_offset(pud, addr); do { unsigned long this_pages; next = pmd_addr_end(addr, end); + if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + continue; + + /* invoke the mmu notifier if the pmd is populated */ + if (!mni_start) { + mni_start = addr; + mmu_notifier_invalidate_range_start(mm, mni_start, end); + } + if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); @@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += HPAGE_PMD_NR; nr_huge_updates++; } + + /* huge pmd was handled */ continue; } } - /* fall through */ + /* fall through, the trans huge pmd just split */ } - if (pmd_none_or_clear_bad(pmd)) - continue; this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; } while (pmd++, addr = next, addr != end); + if (mni_start) + mmu_notifier_invalidate_range_end(mm, mni_start, end); + if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { - struct mm_struct *mm = vma->vm_mm; unsigned long pages; - mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); - mmu_notifier_invalidate_range_end(mm, start, end); return pages; } diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb..05f1180 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -194,10 +194,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma, break; if (pmd_trans_huge(*old_pmd)) { int err = 0; - if (extent == HPAGE_PMD_SIZE) + if (extent == HPAGE_PMD_SIZE) { + VM_BUG_ON(vma->vm_file || !vma->anon_vma); + /* See comment in move_ptes() */ + if (need_rmap_locks) + anon_vma_lock_write(vma->anon_vma); err = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); + if (need_rmap_locks) + anon_vma_unlock_write(vma->anon_vma); + } if (err > 0) { need_flush = true; continue; @@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) vma = find_vma(mm, start); for (;;) { struct file *file; + loff_t fstart, fend; /* Still start < end. */ error = -ENOMEM; @@ -77,12 +78,18 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; } file = vma->vm_file; + fstart = (start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + fend = fstart + (min(end, vma->vm_end) - start) - 1; start = vma->vm_end; if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = vfs_fsync(file, 0); + if (vma->vm_flags & VM_NONLINEAR) + error = vfs_fsync(file, 1); + else + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index f73f298..7ed5860 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -197,7 +197,6 @@ unsigned long __init free_all_bootmem(void) void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - kmemleak_free_part(__va(physaddr), size); memblock_free(physaddr, size); } @@ -212,7 +211,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { - kmemleak_free_part(__va(addr), size); memblock_free(addr, size); } @@ -334,7 +332,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, +static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -13,8 +13,11 @@ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -24,12 +27,14 @@ #include <linux/vmalloc.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/compiler.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/sched/sysctl.h> +#include <linux/printk.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -296,7 +301,7 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; memcpy(addr, buf, count); - return(count); + return count; } /* @@ -459,7 +464,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } @@ -768,16 +773,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -825,8 +837,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -835,7 +847,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -874,8 +886,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -886,7 +898,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -995,7 +1007,7 @@ static int validate_mmap_request(struct file *file, (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file_inode(file))) + if (locks_verify_locked(file)) return -EAGAIN; if (!(capabilities & BDI_CAP_MAP_DIRECT)) @@ -1003,8 +1015,7 @@ static int validate_mmap_request(struct file *file, /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; - } - else { + } else { /* we're going to read the file into private memory we * allocate */ if (!(capabilities & BDI_CAP_MAP_COPY)) @@ -1035,23 +1046,20 @@ static int validate_mmap_request(struct file *file, if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (prot & PROT_EXEC) return -EPERM; - } - else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { if (capabilities & BDI_CAP_EXEC_MAP) prot |= PROT_EXEC; } - } - else if ((prot & PROT_READ) && + } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP) ) { /* backing file is not executable, try to copy */ capabilities &= ~BDI_CAP_MAP_DIRECT; } - } - else { + } else { /* anonymous mappings are always memory backed and can be * privately mapped */ @@ -1241,7 +1249,7 @@ error_free: return ret; enomem: - printk("Allocation of length %lu from process %d (%s) failed\n", + pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); show_free_areas(0); return -ENOMEM; @@ -1659,7 +1667,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - static int limit = 0; + static int limit; if (limit < 5) { printk(KERN_WARNING "munmap of memory not mmapped by process %d" @@ -1985,6 +1993,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); +} +EXPORT_SYMBOL(filemap_map_pages); + int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1..e0c94301 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0; #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) /* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -/* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect @@ -593,14 +575,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ -static inline long long pos_ratio_polynom(unsigned long setpoint, +static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -842,7 +824,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; @@ -1324,9 +1306,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); if (bdi_bg_thresh) - *bdi_bg_thresh = div_u64((u64)*bdi_thresh * - background_thresh, - dirty_thresh); + *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1562,9 +1544,9 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page, int page_mkwrite) +void set_page_dirty_balance(struct page *page) { - if (set_page_dirty(page) || page_mkwrite) { + if (set_page_dirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping) @@ -1623,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ - p = &__get_cpu_var(bdp_ratelimits); + p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { @@ -1635,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ - p = &__get_cpu_var(dirty_throttle_leaks); + p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); @@ -1682,7 +1664,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); @@ -2398,7 +2380,7 @@ int test_clear_page_writeback(struct page *page) return ret; } -int test_set_page_writeback(struct page *page) +int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); int ret; @@ -2423,9 +2405,10 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_TOWRITE); + if (!keep_write) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); @@ -2436,7 +2419,7 @@ int test_set_page_writeback(struct page *page) return ret; } -EXPORT_SYMBOL(test_set_page_writeback); +EXPORT_SYMBOL(__test_set_page_writeback); /* * Return true if any of the pages in the mapping are marked with the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bac76a..ef44ad7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -261,8 +262,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) } while (zone_span_seqretry(zone, seq)); if (ret) - pr_err("page %lu outside zone [ %lu - %lu ]\n", - pfn, start_pfn, start_pfn + sp); + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); return ret; } @@ -295,7 +297,8 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, char *reason, unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason, + unsigned long bad_flags) { static unsigned long resume; static unsigned long nr_shown; @@ -407,7 +410,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -451,7 +455,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -502,21 +506,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; @@ -548,6 +562,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -564,7 +579,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -623,7 +638,7 @@ out: static inline int free_pages_check(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -699,7 +714,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -711,13 +726,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + unsigned int order, int migratetype) { spin_lock(&zone->lock); zone->pages_scanned = 0; - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); @@ -754,15 +771,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -798,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); - __free_pages(page, pageblock_order); + + if (pageblock_order >= MAX_ORDER) { + i = pageblock_nr_pages; + p = page; + do { + set_page_refcounted(p); + __free_pages(p, MAX_ORDER - 1); + p += MAX_ORDER_NR_PAGES; + } while (i -= MAX_ORDER_NR_PAGES); + } else { + set_page_refcounted(page); + __free_pages(page, pageblock_order); + } + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -859,7 +889,7 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -881,7 +911,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -930,6 +960,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1056,7 +1087,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1089,16 +1122,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1124,6 +1158,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1172,9 +1212,9 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1191,18 +1231,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1238,15 +1272,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; -} -#else -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return false; -} #endif /* @@ -1335,7 +1360,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1367,19 +1392,20 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1393,17 +1419,17 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1418,7 +1444,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1530,12 +1556,12 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -1580,15 +1606,10 @@ again: if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); } - /* - * NOTE: GFP_THISNODE allocations do not partake in the kswapd - * aging protocol, so they can't be fair. - */ - if (!gfp_thisnode_allocation(gfp_flags)) - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1685,8 +1706,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; @@ -1720,15 +1742,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -1863,18 +1885,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} - -static void __paginginit init_zone_allows_reclaim(int nid) -{ - int i; - - for_each_online_node(i) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) - node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -1908,9 +1920,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return true; } -static inline void init_zone_allows_reclaim(int nid) -{ -} #endif /* CONFIG_NUMA */ /* @@ -1920,17 +1929,17 @@ static inline void init_zone_allows_reclaim(int nid) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. @@ -1943,34 +1952,21 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. - * - * Try to stay in local zones in the fastpath. If - * that fails, the slowpath is entered, which will do - * another pass starting with the local zones, but - * ultimately fall back to remote zones that do not - * partake in the fairness round-robin cycle of this - * zonelist. - * - * NOTE: GFP_THISNODE allocations do not partake in - * the kswapd aging protocol, so they can't be fair. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - !gfp_thisnode_allocation(gfp_mask)) { - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) continue; + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; } /* * When allocating a page cache page for writing, we @@ -1998,15 +1994,19 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + if (consider_zone_dirty && !zone_dirty_ok(zone)) + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* @@ -2068,7 +2068,7 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } @@ -2197,7 +2197,7 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; @@ -2215,7 +2215,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2250,7 +2250,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2264,7 +2264,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2278,7 +2278,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; compaction_defer_reset(preferred_zone, order, true); @@ -2297,7 +2297,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2310,9 +2310,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2351,7 +2351,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2369,7 +2369,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2392,14 +2393,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2408,37 +2409,45 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) +static void reset_alloc_batches(struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); /* * Only reset the batches of zones that were actually - * considered in the fast path, we don't want to - * thrash fairness information for zones that are not + * considered in the fairness pass, we don't want to + * trash fairness information for zones that are not * actually part of this zonelist's round-robin cycle. */ if (!zone_local(preferred_zone, zone)) continue; mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); } } +static void wake_all_kswapds(unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); +} + static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const gfp_t wait = gfp_mask & __GFP_WAIT; + const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2447,20 +2456,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (!wait) { + if (atomic) { /* - * Not worth trying to allocate harder for - * __GFP_NOMEMALLOC even if it can't schedule. + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (!(gfp_mask & __GFP_NOMEMALLOC)) alloc_flags |= ALLOC_HARDER; /* - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the + * comment for __cpuset_node_allowed_softwall(). */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && !in_interrupt()) @@ -2492,14 +2501,14 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2522,12 +2531,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (gfp_thisnode_allocation(gfp_mask)) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: - prepare_slowpath(gfp_mask, order, zonelist, - high_zoneidx, preferred_zone); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -2540,15 +2550,18 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2563,7 +2576,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2592,17 +2605,23 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; + + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because @@ -2619,7 +2638,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2638,7 +2658,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2677,12 +2697,11 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, + classzone_idx, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2708,11 +2727,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; - struct mem_cgroup *memcg = NULL; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2731,33 +2751,44 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif +retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* + * The first pass makes sure allocations are spread + * fairly within the local node. However, the local + * node might have free pages left after the fairness + * batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without + * fairness, and include remote zones now, before + * entering the slowpath and waking kswapd: prefer + * spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + reset_alloc_batches(zonelist, high_zoneidx, + preferred_zone); + alloc_flags &= ~ALLOC_FAIR; + goto retry; + } + /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. @@ -2765,7 +2796,7 @@ retry_cpuset: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2777,11 +2808,9 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2816,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } @@ -2835,27 +2864,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. - * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. + */ +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } @@ -3045,9 +3098,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } @@ -3349,7 +3402,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -4093,7 +4146,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -4105,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4221,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -4347,9 +4400,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative */ int __meminit __early_pfn_to_nid(unsigned long pfn) { @@ -4404,10 +4454,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memblock_free_early_nid() - * manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4429,9 +4478,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. + * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -4449,7 +4497,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ @@ -4919,7 +4967,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif @@ -5027,7 +5074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -5070,7 +5117,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); - struct memblock_type *type = &memblock.memory; + struct memblock_region *r; /* Need to find movable_zone earlier when movable_node is specified. */ find_usable_zone_for_movable(); @@ -5080,13 +5127,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for (i = 0; i < type->cnt; i++) { - if (!memblock_is_hotpluggable(&type->regions[i])) + for_each_memblock(memory, r) { + if (!memblock_is_hotpluggable(r)) continue; - nid = type->regions[i].nid; + nid = r->nid; - usable_startpfn = PFN_DOWN(type->regions[i].base); + usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; @@ -5248,7 +5295,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed @@ -5771,7 +5818,7 @@ module_init(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5788,7 +5835,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5804,7 +5851,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return 0; } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5830,7 +5877,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5843,27 +5890,42 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; @@ -6000,59 +6062,73 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) } /** - * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest to retrieve - * @end_bitidx: The last bit of interest - * returns pageblock_bits flags + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest - * @end_bitidx: The last bit of interest * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* @@ -6212,7 +6288,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6251,7 +6327,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); @@ -6406,7 +6482,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6458,7 +6534,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; - int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -6544,7 +6620,8 @@ static void dump_page_flags(unsigned long flags) printk(")\n"); } -void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", @@ -6560,8 +6637,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) mem_cgroup_print_bad_page(page); } -void dump_page(struct page *page, char *reason) +void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); } -EXPORT_SYMBOL_GPL(dump_page); +EXPORT_SYMBOL(dump_page); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index cfd1628..3708264 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -175,7 +175,7 @@ static void free_page_cgroup(void *addr) } } -void __free_page_cgroup(unsigned long pfn) +static void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; struct page_cgroup *base; @@ -188,9 +188,9 @@ void __free_page_cgroup(unsigned long pfn) ms->page_cgroup = NULL; } -int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) +static int __meminit online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) { unsigned long start, end, pfn; int fail = 0; @@ -223,8 +223,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) +static int __meminit offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) { unsigned long start, end, pfn; diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef6..955db8b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -248,21 +248,34 @@ out: return ret; } +static sector_t swap_page_sector(struct page *page) +{ + return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)) { struct bio *bio; - int ret = 0, rw = WRITE; + int ret, rw = WRITE; struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; - struct iovec iov = { - .iov_base = kmap(page), - .iov_len = PAGE_SIZE, + struct bio_vec bv = { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0 + }; + struct iov_iter from = { + .type = ITER_BVEC | WRITE, + .count = PAGE_SIZE, + .iov_offset = 0, + .nr_segs = 1, }; + from.bvec = &bv; /* older gcc versions are broken */ init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); @@ -270,10 +283,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); - ret = mapping->a_ops->direct_IO(KERNEL_WRITE, - &kiocb, &iov, - kiocb.ki_pos, 1); - kunmap(page); + ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, + &kiocb, &from, + kiocb.ki_pos); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; @@ -297,6 +309,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return ret; } + ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); + if (!ret) { + count_vm_event(PSWPOUT); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); @@ -338,6 +357,13 @@ int swap_readpage(struct page *page) return ret; } + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + count_vm_event(PSWPIN); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff --git a/mm/percpu.c b/mm/percpu.c index 63e24fb..2139e30 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -610,7 +610,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - kfree(chunk); + pcpu_mem_free(chunk, pcpu_chunk_struct_size); return NULL; } @@ -720,8 +720,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) if (unlikely(align < 2)) align = 2; - if (unlikely(size & 1)) - size++; + size = ALIGN(size, 2); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 3c5cf68..5077afc 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -23,129 +23,40 @@ /** * process_vm_rw_pages - read/write pages from task specified - * @task: task to read/write from - * @mm: mm for task - * @process_pages: struct pages area that can store at least - * nr_pages_to_copy struct page pointers - * @pa: address of page in task to start copying from/to + * @pages: array of pointers to pages we want to copy * @start_offset: offset in page to start copying from/to * @len: number of bytes to copy - * @lvec: iovec array specifying where to copy to/from - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to - * @nr_pages_to_copy: number of pages to copy - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success, error code otherwise */ -static int process_vm_rw_pages(struct task_struct *task, - struct mm_struct *mm, - struct page **process_pages, - unsigned long pa, - unsigned long start_offset, - unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, - int vm_write, - unsigned int nr_pages_to_copy, - ssize_t *bytes_copied) +static int process_vm_rw_pages(struct page **pages, + unsigned offset, + size_t len, + struct iov_iter *iter, + int vm_write) { - int pages_pinned; - void *target_kaddr; - int pgs_copied = 0; - int j; - int ret; - ssize_t bytes_to_copy; - ssize_t rc = 0; - - *bytes_copied = 0; - - /* Get the pages we're interested in */ - down_read(&mm->mmap_sem); - pages_pinned = get_user_pages(task, mm, pa, - nr_pages_to_copy, - vm_write, 0, process_pages, NULL); - up_read(&mm->mmap_sem); - - if (pages_pinned != nr_pages_to_copy) { - rc = -EFAULT; - goto end; - } - /* Do the copy for each page */ - for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); - pgs_copied++) { - /* Make sure we have a non zero length iovec */ - while (*lvec_current < lvec_cnt - && lvec[*lvec_current].iov_len == 0) - (*lvec_current)++; - if (*lvec_current == lvec_cnt) - break; + while (len && iov_iter_count(iter)) { + struct page *page = *pages++; + size_t copy = PAGE_SIZE - offset; + size_t copied; - /* - * Will copy smallest of: - * - bytes remaining in page - * - bytes remaining in destination iovec - */ - bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, - len - *bytes_copied); - bytes_to_copy = min_t(ssize_t, bytes_to_copy, - lvec[*lvec_current].iov_len - - *lvec_offset); - - target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; - - if (vm_write) - ret = copy_from_user(target_kaddr, - lvec[*lvec_current].iov_base - + *lvec_offset, - bytes_to_copy); - else - ret = copy_to_user(lvec[*lvec_current].iov_base - + *lvec_offset, - target_kaddr, bytes_to_copy); - kunmap(process_pages[pgs_copied]); - if (ret) { - *bytes_copied += bytes_to_copy - ret; - pgs_copied++; - rc = -EFAULT; - goto end; - } - *bytes_copied += bytes_to_copy; - *lvec_offset += bytes_to_copy; - if (*lvec_offset == lvec[*lvec_current].iov_len) { - /* - * Need to copy remaining part of page into the - * next iovec if there are any bytes left in page - */ - (*lvec_current)++; - *lvec_offset = 0; - start_offset = (start_offset + bytes_to_copy) - % PAGE_SIZE; - if (start_offset) - pgs_copied--; - } else { - start_offset = 0; - } - } + if (copy > len) + copy = len; -end: - if (vm_write) { - for (j = 0; j < pages_pinned; j++) { - if (j < pgs_copied) - set_page_dirty_lock(process_pages[j]); - put_page(process_pages[j]); + if (vm_write) { + copied = copy_page_from_iter(page, offset, copy, iter); + set_page_dirty_lock(page); + } else { + copied = copy_page_to_iter(page, offset, copy, iter); } - } else { - for (j = 0; j < pages_pinned; j++) - put_page(process_pages[j]); + len -= copied; + if (copied < copy && iov_iter_count(iter)) + return -EFAULT; + offset = 0; } - - return rc; + return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ @@ -155,67 +66,60 @@ end: * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from - * @lvec: iovec array specifying where to copy to/from locally - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, + struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, - int vm_write, - ssize_t *bytes_copied) + int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; - ssize_t bytes_copied_loop; ssize_t rc = 0; - unsigned long nr_pages_copied = 0; - unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - *bytes_copied = 0; - /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { - nr_pages_to_copy = min(nr_pages - nr_pages_copied, - max_pages_per_loop); + while (!rc && nr_pages && iov_iter_count(iter)) { + int pages = min(nr_pages, max_pages_per_loop); + size_t bytes; - rc = process_vm_rw_pages(task, mm, process_pages, pa, - start_offset, len, - lvec, lvec_cnt, - lvec_current, lvec_offset, - vm_write, nr_pages_to_copy, - &bytes_copied_loop); - start_offset = 0; - *bytes_copied += bytes_copied_loop; + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages = get_user_pages(task, mm, pa, pages, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); - if (rc < 0) { - return rc; - } else { - len -= bytes_copied_loop; - nr_pages_copied += nr_pages_to_copy; - pa += nr_pages_to_copy * PAGE_SIZE; - } + if (pages <= 0) + return -EFAULT; + + bytes = pages * PAGE_SIZE - start_offset; + if (bytes > len) + bytes = len; + + rc = process_vm_rw_pages(process_pages, + start_offset, bytes, iter, + vm_write); + len -= bytes; + start_offset = 0; + nr_pages -= pages; + pa += pages * PAGE_SIZE; + while (pages) + put_page(process_pages[--pages]); } return rc; @@ -228,8 +132,7 @@ static int process_vm_rw_single_vec(unsigned long addr, /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to - * @lvec: iovec array specifying where to copy to/from locally - * @liovcnt: size of lvec array + * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused @@ -238,8 +141,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * return less bytes than expected if an error occurs during the copying * process. */ -static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, - unsigned long liovcnt, +static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) @@ -250,13 +152,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, struct mm_struct *mm; unsigned long i; ssize_t rc = 0; - ssize_t bytes_copied_loop; - ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; - unsigned long iov_l_curr_idx = 0; - size_t iov_l_curr_offset = 0; ssize_t iov_len; + size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need @@ -310,24 +209,20 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto put_task_struct; } - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, - process_pages, mm, task, vm_write, &bytes_copied_loop); - bytes_copied += bytes_copied_loop; - if (rc != 0) { - /* If we have managed to copy any data at all then - we return the number of bytes copied. Otherwise - we return the error code */ - if (bytes_copied) - rc = bytes_copied; - goto put_mm; - } - } + iter, process_pages, mm, task, vm_write); + + /* copied = space before - space after */ + total_len -= iov_iter_count(iter); + + /* If we have managed to copy any data at all then + we return the number of bytes copied. Otherwise + we return the error code */ + if (total_len) + rc = total_len; - rc = bytes_copied; -put_mm: mmput(mm); put_task_struct: @@ -363,6 +258,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc; if (flags != 0) @@ -378,13 +274,14 @@ static ssize_t process_vm_rw(pid_t pid, if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); + rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) @@ -412,7 +309,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, #ifdef CONFIG_COMPAT -asmlinkage ssize_t +static ssize_t compat_process_vm_rw(compat_pid_t pid, const struct compat_iovec __user *lvec, unsigned long liovcnt, @@ -424,6 +321,7 @@ compat_process_vm_rw(compat_pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc = -EFAULT; if (flags != 0) @@ -439,14 +337,14 @@ compat_process_vm_rw(compat_pid_t pid, &iov_l); if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) diff --git a/mm/readahead.c b/mm/readahead.c index 0de2360..0ca36a7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,9 +8,7 @@ */ #include <linux/kernel.h> -#include <linux/fs.h> #include <linux/gfp.h> -#include <linux/mm.h> #include <linux/export.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -20,6 +18,8 @@ #include <linux/syscalls.h> #include <linux/file.h> +#include "internal.h" + /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -149,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); @@ -233,28 +232,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); -} - -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; + return min(nr, MAX_READAHEAD); } /* @@ -347,7 +332,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -427,7 +412,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) @@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ + might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); @@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) * above cannot corrupt). */ if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; + return NULL; } out: rcu_read_unlock(); @@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; - goto out; + return NULL; } /* we pinned the anon_vma, its safe to sleep */ @@ -515,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) static inline unsigned long __vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (unlikely(is_vm_hugetlb_page(vma))) - pgoff = page->index << huge_page_order(page_hstate(page)); - + pgoff_t pgoff = page_to_pgoff(page); return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); } @@ -567,6 +565,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pgd_t *pgd; pud_t *pud; pmd_t *pmd = NULL; + pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -577,7 +576,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + /* + * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = ACCESS_ONCE(*pmd); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) pmd = NULL; out: return pmd; @@ -613,9 +618,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (!pmd) return NULL; - if (pmd_trans_huge(*pmd)) - return NULL; - pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ if (!sync && !pte_present(*pte)) { @@ -669,7 +671,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, { int first = atomic_inc_and_test(&page->_mapcount); if (first) { + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption + * disabled. + */ if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); @@ -1024,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) { + + VM_BUG_ON_PAGE(PageLRU(page), page); + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { SetPageActive(page); lru_cache_add(page); - } else - add_page_to_unevictable_list(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); } /** @@ -1077,6 +1099,11 @@ void page_remove_rmap(struct page *page) /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. + * + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. */ if (unlikely(PageHuge(page))) goto out; @@ -1112,7 +1139,7 @@ out: /* * @arg: enum ttu_flags will be passed to this argument */ -int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1135,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) goto out_mlock; - if (TTU_ACTION(flags) == TTU_MUNLOCK) + if (flags & TTU_MUNLOCK) goto out_unmap; } if (!(flags & TTU_IGNORE_ACCESS)) { @@ -1203,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); + BUG_ON(!(flags & TTU_MIGRATION)); entry = make_migration_entry(page, pte_write(pteval)); } swp_pte = swp_entry_to_pte(entry); @@ -1212,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (TTU_ACTION(flags) == TTU_MIGRATION)) { + (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1225,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL) + if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1332,9 +1359,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, BUG_ON(!page || PageAnon(page)); if (locked_vma) { - mlock_vma_page(page); /* no-op if already mlocked */ - if (page == check_page) + if (page == check_page) { + /* we know we have check_page locked */ + mlock_vma_page(page); ret = SWAP_MLOCK; + } else if (trylock_page(page)) { + /* + * If we can lock the page, perform mlock. + * Otherwise leave the page alone, it will be + * eventually encountered again later. + */ + mlock_vma_page(page); + unlock_page(page); + } continue; /* don't unmap */ } @@ -1349,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (page->index != linear_page_index(vma, address)) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(pteval)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, address, pte, ptfile); } @@ -1502,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; ret = rmap_walk(page, &rwc); @@ -1554,10 +1591,9 @@ void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; + anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); - - anon_vma_free(anon_vma); } static struct anon_vma *rmap_walk_anon_lock(struct page *page, @@ -1599,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1640,7 +1676,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << compound_order(page); + pgoff_t pgoff = page_to_pgoff(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate and shmem_writepage communicate via inode->i_private - * (with i_mutex making sure that it has only one user at a time): - * we would prefer not to enlarge the shmem inode just for that. + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -242,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } @@ -331,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; - - if (!nr_pages) - return 0; - - rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) - break; - } - rcu_read_unlock(); - return ret; -} - -/* * Remove swap entry from radix tree, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} - -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* @@ -429,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -466,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec, 0); index = start; while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; mem_cgroup_uncharge_start(); @@ -497,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -533,22 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, return; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + + pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.pages, indices); if (!pvec.nr) { - if (index == start || unfalloc) + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) break; + /* But if truncating, restart to make sure all gone */ index = start; continue; } - if ((index == start || unfalloc) && indices[0] >= end) { - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - break; - } mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -560,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (radix_tree_exceptional_entry(page)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + if (shmem_free_swap(mapping, index, page)) { + /* Swap was replaced by page: retry */ + index--; + break; + } + nr_swaps_freed++; continue; } @@ -570,11 +507,16 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (page->mapping == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); + } else { + /* Page was replaced by swap: retry */ + unlock_page(page); + index--; + break; } } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -748,7 +690,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -824,6 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && + !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -1080,7 +1023,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; @@ -1092,6 +1035,9 @@ repeat: goto failed; } + if (page && sgp == SGP_WRITE) + mark_page_accessed(page); + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -1145,7 +1091,7 @@ repeat: goto failed; } - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1173,6 +1119,9 @@ repeat: shmem_recalc_inode(inode); spin_unlock(&info->lock); + if (sgp == SGP_WRITE) + mark_page_accessed(page); + delete_from_swap_cache(page); set_page_dirty(page); swap_free(swap); @@ -1197,9 +1146,12 @@ repeat: goto decused; } - SetPageSwapBacked(page); + __SetPageSwapBacked(page); __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, + if (sgp == SGP_WRITE) + init_page_accessed(page); + + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) goto decused; @@ -1298,6 +1250,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_mutex in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT(shmem_fault_wait); + + ret = VM_FAULT_NOPAGE; + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* It's polite to up mmap_sem if we can */ + up_read(&vma->vm_mm->mmap_sem); + ret = VM_FAULT_RETRY; + } + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + return ret; + } + spin_unlock(&inode->i_lock); + } + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1417,6 +1427,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1462,13 +1477,17 @@ shmem_write_end(struct file *file, struct address_space *mapping, return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct inode *inode = file_inode(filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; enum sgp_type sgp = SGP_READ; + int error = 0; + ssize_t retval = 0; + loff_t *ppos = &iocb->ki_pos; /* * Might this read be for a stacking filesystem? Then when reading @@ -1496,10 +1515,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, sgp, NULL); - if (desc->error) { - if (desc->error == -EINVAL) - desc->error = 0; + error = shmem_getpage(inode, index, &page, sgp, NULL); + if (error) { + if (error == -EINVAL) + error = 0; break; } if (page) @@ -1543,61 +1562,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + ret = copy_page_to_iter(page, offset, nr, to); + retval += ret; offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); - if (ret != nr || !desc->count) + if (!iov_iter_count(to)) break; - + if (ret < nr) { + error = -EFAULT; + break; + } cond_resched(); } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - file_accessed(filp); -} - -static ssize_t shmem_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } - return retval; + file_accessed(file); + return retval ? retval : error; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, @@ -1636,7 +1620,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); @@ -1729,7 +1713,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pagevec_init(&pvec, 0); pvec.nr = 1; /* start small: we may be there already */ while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr = find_get_entries(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { if (whence == SEEK_DATA) @@ -1756,7 +1740,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, break; } } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); pvec.nr = PAGEVEC_SIZE; cond_resched(); @@ -1811,18 +1795,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, pgoff_t start, index, end; int error; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + + shmem_falloc.waitq = &shmem_falloc_waitq; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + spin_unlock(&inode->i_lock); error = 0; goto out; } @@ -1840,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; @@ -2708,13 +2709,13 @@ static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = shmem_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = shmem_file_read_iter, + .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif }; @@ -2783,6 +2784,7 @@ static const struct super_operations shmem_ops = { static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, @@ -157,6 +157,17 @@ #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ + <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t; +#else +typedef unsigned short freelist_idx_t; +#endif + +#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) + /* * true if a page was allocated from pfmemalloc reserves for network-based * swap @@ -277,8 +288,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) @@ -375,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#define OBJECT_FREE (0) +#define OBJECT_ACTIVE (1) + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void set_obj_status(struct page *page, int idx, int val) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + status[idx] = val; +} + +static inline unsigned int get_obj_status(struct page *page, int idx) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + + return status[idx]; +} + +#else +static inline void set_obj_status(struct page *page, int idx, int val) {} + +#endif + /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -565,9 +609,50 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static size_t slab_mgmt_size(size_t nr_objs, size_t align) +static size_t calculate_freelist_size(int nr_objs, size_t align) { - return ALIGN(nr_objs * sizeof(unsigned int), align); + size_t freelist_size; + + freelist_size = nr_objs * sizeof(freelist_idx_t); + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size += nr_objs * sizeof(char); + + if (align) + freelist_size = ALIGN(freelist_size, align); + + return freelist_size; +} + +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, + size_t idx_size, size_t align) +{ + int nr_objs; + size_t remained_size; + size_t freelist_size; + int extra_space = 0; + + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + extra_space = sizeof(char); + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = slab_size / (buffer_size + idx_size + extra_space); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + remained_size = slab_size - nr_objs * buffer_size; + freelist_size = calculate_freelist_size(nr_objs, align); + if (remained_size < freelist_size) + nr_objs--; + + return nr_objs; } /* @@ -600,25 +685,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, nr_objs = slab_size / buffer_size; } else { - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) - nr_objs--; - - mgmt_size = slab_mgmt_size(nr_objs, align); + nr_objs = calculate_nr_objs(slab_size, buffer_size, + sizeof(freelist_idx_t), align); + mgmt_size = calculate_freelist_size(nr_objs, align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; @@ -1067,7 +1136,7 @@ static int init_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { /* - * Set up the size64 kmemlist for cpu before we can + * Set up the kmem_cache_node for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ @@ -1076,12 +1145,12 @@ static int init_cache_node_node(int node) if (!n) return -ENOMEM; kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; /* - * The l3s don't come and go as CPUs come and - * go. slab_mutex is sufficient + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient * protection here. */ cachep->node[node] = n; @@ -1406,8 +1475,8 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) for_each_online_node(node) { cachep->node[node] = &init_kmem_cache_node[index + node]; cachep->node[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; } } @@ -1604,10 +1673,16 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { +#if DEBUG struct kmem_cache_node *n; struct page *page; unsigned long flags; int node; + static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) + return; printk(KERN_WARNING "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", @@ -1645,6 +1720,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } +#endif } /* @@ -1664,10 +1740,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; + if (memcg_charge_slab(cachep, flags, cachep->gfporder)) + return NULL; + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - if (!(flags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(cachep, flags, nodeid); + memcg_uncharge_slab(cachep, cachep->gfporder); + slab_out_of_memory(cachep, flags, nodeid); return NULL; } @@ -1685,7 +1764,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); - memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1721,10 +1799,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page_mapcount_reset(page); page->mapping = NULL; - memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_memcg_kmem_pages(page, cachep->gfporder); + __free_pages(page, cachep->gfporder); + memcg_uncharge_slab(cachep, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -2010,14 +2088,21 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + if (flags & CFLGS_OFF_SLAB) { + size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= sizeof(unsigned int); + offslab_limit /= freelist_size_per_obj; if (num > offslab_limit) break; @@ -2103,8 +2188,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } cachep->node[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; cpu_cache_get(cachep)->avail = 0; cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; @@ -2243,7 +2328,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2252,14 +2337,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) flags |= CFLGS_OFF_SLAB; size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); left_over = calculate_slab_order(cachep, size, cachep->align, flags); if (!cachep->num) return -E2BIG; - freelist_size = - ALIGN(cachep->num * sizeof(unsigned int), cachep->align); + freelist_size = calculate_freelist_size(cachep->num, cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2272,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - freelist_size = cachep->num * sizeof(unsigned int); + freelist_size = calculate_freelist_size(cachep->num, 0); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2300,10 +2390,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* - * This is a possibility for one of the malloc_sizes caches. + * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, - * this should not happen at all. + * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); @@ -2442,8 +2532,7 @@ out: return nr_freed; } -/* Called with slab_mutex held to protect against cpu hotplug */ -static int __cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; struct kmem_cache_node *n; @@ -2464,32 +2553,11 @@ static int __cache_shrink(struct kmem_cache *cachep) return (ret ? 1 : 0); } -/** - * kmem_cache_shrink - Shrink a cache. - * @cachep: The cache to shrink. - * - * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. - */ -int kmem_cache_shrink(struct kmem_cache *cachep) -{ - int ret; - BUG_ON(!cachep || in_interrupt()); - - get_online_cpus(); - mutex_lock(&slab_mutex); - ret = __cache_shrink(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return ret; -} -EXPORT_SYMBOL(kmem_cache_shrink); - int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __cache_shrink(cachep); + int rc = __kmem_cache_shrink(cachep); if (rc) return rc; @@ -2511,14 +2579,17 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) /* * Get the memory for a slab management obj. - * For a slab cache when the slab descriptor is off-slab, slab descriptors - * always come from malloc_sizes caches. The slab descriptor cannot - * come from the same cache which is getting created because, - * when we are searching for an appropriate cache for these - * descriptors in kmem_cache_create, we search through the malloc_sizes array. - * If we are creating a malloc_sizes cache here it would not be visible to - * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have freelist_cache same as the original cache. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). */ static void *alloc_slabmgmt(struct kmem_cache *cachep, struct page *page, int colour_off, @@ -2542,9 +2613,15 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, return freelist; } -static inline unsigned int *slab_freelist(struct page *page) +static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) +{ + return ((freelist_idx_t *)page->freelist)[idx]; +} + +static inline void set_free_obj(struct page *page, + unsigned int idx, freelist_idx_t val) { - return (unsigned int *)(page->freelist); + ((freelist_idx_t *)(page->freelist))[idx] = val; } static void cache_init_objs(struct kmem_cache *cachep, @@ -2589,7 +2666,8 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_freelist(page)[i] = i; + set_obj_status(page, i, OBJECT_FREE); + set_free_obj(page, i, i); } } @@ -2608,7 +2686,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, { void *objp; - objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); + objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; #if DEBUG WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); @@ -2629,7 +2707,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { - if (slab_freelist(page)[i] == objnr) { + if (get_free_obj(page, i) == objnr) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2637,7 +2715,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; - slab_freelist(page)[page->active] = objnr; + set_free_obj(page, page->active, objnr); } /* @@ -2797,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); + set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2886,9 +2965,9 @@ retry: /* move slabp to correct slabp list: */ list_del(&page->lru); if (page->active == cachep->num) - list_add(&page->list, &n->slabs_full); + list_add(&page->lru, &n->slabs_full); else - list_add(&page->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: @@ -2930,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + struct page *page; + if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -2960,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } + + page = virt_to_head_page(objp); + set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -3027,7 +3111,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. @@ -3042,7 +3126,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) - nid_alloc = slab_node(); + nid_alloc = mempolicy_slab_node(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3073,8 +3157,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); retry: /* @@ -3131,7 +3215,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } @@ -3245,11 +3329,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); - if (likely(ptr)) + if (likely(ptr)) { kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && ptr)) - memset(ptr, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(ptr, 0, cachep->object_size); + } return ptr; } @@ -3259,7 +3343,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; @@ -3310,17 +3394,17 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) flags); prefetchw(objp); - if (likely(objp)) + if (likely(objp)) { kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - - if (unlikely((flags & __GFP_ZERO) && objp)) - memset(objp, 0, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(objp, 0, cachep->object_size); + } return objp; } /* - * Caller needs to acquire correct kmem_list's list_lock + * Caller needs to acquire correct kmem_cache_node's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) @@ -3574,11 +3658,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, struct kmem_cache *cachep; void *ret; - /* If you want to save a few bytes .text space: replace - * __ with kmem_. - * Then kmalloc uses the uninlined functions instead of the inline - * functions. - */ cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; @@ -3670,7 +3749,7 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) { int node; struct kmem_cache_node *n; @@ -3726,8 +3805,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; n->shared = new_shared; n->alien = new_alien; n->free_limit = (1 + nr_cpus_node(node)) * @@ -3813,7 +3892,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep, gfp); + return alloc_kmem_cache_node(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3982,7 +4061,7 @@ static void cache_reap(struct work_struct *w) if (time_after(n->next_reap, jiffies)) goto next; - n->next_reap = jiffies + REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE; drain_array(searchp, n, n->shared, 0, node); @@ -4003,7 +4082,7 @@ next: next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } #ifdef CONFIG_SLABINFO @@ -4201,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i, j; + int i; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - bool active = true; - - for (j = page->active; j < c->num; j++) { - /* Skip freed item */ - if (slab_freelist(page)[j] == i) { - active = false; - break; - } - } - if (!active) + if (get_obj_status(page, i) != OBJECT_ACTIVE) continue; if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) @@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, struct mem_cgroup; #ifdef CONFIG_SLUB struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif @@ -91,6 +91,8 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; @@ -119,28 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) -{ - return (is_root_cache(cachep) && !memcg) || - (cachep->memcg_params->memcg == memcg); -} - -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_add(1 << order, &s->memcg_params->nr_pages); -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ - if (is_root_cache(s)) - return; - - if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) - mem_cgroup_destroy_cache(s); -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { @@ -198,24 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; return s->memcg_params->root_cache; } -#else -static inline bool is_root_cache(struct kmem_cache *s) -{ - return true; -} -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) +static __always_inline int memcg_charge_slab(struct kmem_cache *s, + gfp_t gfp, int order) { - return true; + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return __memcg_charge_slab(s, gfp, order); } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) +static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) { + if (!memcg_kmem_enabled()) + return; + if (is_root_cache(s)) + return; + __memcg_uncharge_slab(s, order); } - -static inline void memcg_release_pages(struct kmem_cache *s, int order) +#else +static inline bool is_root_cache(struct kmem_cache *s) { + return true; } static inline bool slab_equal_or_root(struct kmem_cache *s, @@ -239,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; } + +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ec3c61..d31c4ba 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, - size_t size) +static int kmem_cache_sanity_check(const char *name, size_t size) { struct kmem_cache *s = NULL; @@ -56,14 +55,8 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, continue; } -#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) - /* - * For simplicity, we won't check this in the list of memcg - * caches. We have control over memcg naming, and if there - * aren't duplicates in the global list, there won't be any - * duplicates in the memcg lists as well. - */ - if (!memcg && !strcmp(s->name, name)) { +#if !defined(CONFIG_SLUB) + if (!strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, return 0; } #else -static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, - const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, size_t size) { return 0; } @@ -139,6 +131,45 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static struct kmem_cache * +do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->object_size = object_size; + s->size = size; + s->align = align; + s->ctor = ctor; + + err = memcg_alloc_cache_params(memcg, s, root_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + memcg_free_cache_params(s); + kfree(s); + goto out; +} /* * kmem_cache_create - Create a cache. @@ -164,34 +195,23 @@ unsigned long calculate_alignment(unsigned long flags, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ - struct kmem_cache * -kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct kmem_cache *parent_cache) +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s = NULL; + struct kmem_cache *s; + char *cache_name; int err; get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(memcg, name, size); + err = kmem_cache_sanity_check(name, size); if (err) goto out_unlock; - if (memcg) { - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can - * try to create the same cache, but only one of them may - * succeed. Therefore if we get here and see the cache has - * already been created, we silently return NULL. - */ - if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) - goto out_unlock; - } - /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this @@ -200,50 +220,31 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); + s = __kmem_cache_alias(name, size, align, flags, ctor); if (s) goto out_unlock; - err = -ENOMEM; - s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); - if (!s) + cache_name = kstrdup(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; goto out_unlock; + } - s->object_size = s->size = size; - s->align = calculate_alignment(flags, align, size); - s->ctor = ctor; - - s->name = kstrdup(name, GFP_KERNEL); - if (!s->name) - goto out_free_cache; - - err = memcg_alloc_cache_params(memcg, s, parent_cache); - if (err) - goto out_free_cache; - - err = __kmem_cache_create(s, flags); - if (err) - goto out_free_cache; - - s->refcount = 1; - list_add(&s->list, &slab_caches); - memcg_register_cache(s); + s = do_kmem_cache_create(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree(cache_name); + } out_unlock: mutex_unlock(&slab_mutex); + + put_online_mems(); put_online_cpus(); if (err) { - /* - * There is no point in flooding logs with warnings or - * especially crashing the system if we fail to create a cache - * for a memcg. In this case we will be accounting the memcg - * allocation to the root cgroup until we succeed to create its - * own cache, but it isn't that critical. - */ - if (!memcg) - return NULL; - if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); @@ -255,56 +256,145 @@ out_unlock: return NULL; } return s; +} +EXPORT_SYMBOL(kmem_cache_create); -out_free_cache: - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - goto out_unlock; +#ifdef CONFIG_MEMCG_KMEM +/* + * memcg_create_kmem_cache - Create a cache for a memory cgroup. + * @memcg: The memory cgroup the new cache is for. + * @root_cache: The parent of the new cache. + * @memcg_name: The name of the memory cgroup (used for naming the new cache). + * + * This function attempts to create a kmem cache that will serve allocation + * requests going from @memcg to @root_cache. The new cache inherits properties + * from its parent. + */ +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache, + const char *memcg_name) +{ + struct kmem_cache *s = NULL; + char *cache_name; + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), memcg_name); + if (!cache_name) + goto out_unlock; + + s = do_kmem_cache_create(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); + if (IS_ERR(s)) { + kfree(cache_name); + s = NULL; + } + +out_unlock: + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + return s; } -struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { - return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); + int rc; + + if (!s->memcg_params || + !s->memcg_params->is_root_cache) + return 0; + + mutex_unlock(&slab_mutex); + rc = __memcg_cleanup_cache_params(s); + mutex_lock(&slab_mutex); + + return rc; } -EXPORT_SYMBOL(kmem_cache_create); +#else +static int memcg_cleanup_cache_params(struct kmem_cache *s) +{ + return 0; +} +#endif /* CONFIG_MEMCG_KMEM */ -void kmem_cache_destroy(struct kmem_cache *s) +void slab_kmem_cache_release(struct kmem_cache *s) { - /* Destroy all the children caches if we aren't a memcg cache */ - kmem_cache_destroy_memcg_children(s); + kfree(s->name); + kmem_cache_free(kmem_cache, s); +} +void kmem_cache_destroy(struct kmem_cache *s) +{ get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); + s->refcount--; - if (!s->refcount) { - list_del(&s->list); - - if (!__kmem_cache_shutdown(s)) { - memcg_unregister_cache(s); - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - } else { - list_add(&s->list, &slab_caches); - mutex_unlock(&slab_mutex); - printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", - s->name); - dump_stack(); - } - } else { - mutex_unlock(&slab_mutex); + if (s->refcount) + goto out_unlock; + + if (memcg_cleanup_cache_params(s) != 0) + goto out_unlock; + + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + goto out_unlock; } + + list_del(&s->list); + + mutex_unlock(&slab_mutex); + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + memcg_free_cache_params(s); +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + goto out; + +out_unlock: + mutex_unlock(&slab_mutex); +out: + put_online_mems(); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. + */ +int kmem_cache_shrink(struct kmem_cache *cachep) +{ + int ret; + + get_online_cpus(); + get_online_mems(); + ret = __kmem_cache_shrink(cachep); + put_online_mems(); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL(kmem_cache_shrink); + int slab_is_available(void) { return slab_state >= UP; @@ -519,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { @@ -111,13 +111,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->list, list); + list_add(&sp->lru, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->list); + list_del(&sp->lru); __ClearPageSlobFree(sp); } @@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, list) { + list_for_each_entry(sp, slob_list, lru) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) continue; /* Attempt to alloc */ - prev = sp->list.prev; + prev = sp->lru.prev; b = slob_page_alloc(sp, size, align); if (!b) continue; @@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->list); + INIT_LIST_HEAD(&sp->lru); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); @@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } -int kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", @@ -210,21 +210,22 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } - static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. + */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -402,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -443,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -545,14 +546,14 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; for (i = 0; i < TRACK_ADDRS_COUNT; i++) if (t->addrs[i]) - printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + pr_err("\t%pS\n", (void *)t->addrs[i]); else break; } @@ -570,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR - "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", page, page->objects, page->inuse, page->freelist, page->flags); } static void slab_bug(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + va_end(args); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -613,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); @@ -697,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); @@ -930,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, @@ -1133,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing( slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); } else if (!page->slab_cache) { - printk(KERN_ERR - "SLUB <none>: no slab for object 0x%p.\n", - object); + pr_err("SLUB <none>: no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -1218,8 +1217,8 @@ static int __init setup_slub_debug(char *str) slub_debug |= SLAB_FAILSLAB; break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } @@ -1313,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); flags |= __GFP_NOTRACK; + if (memcg_charge_slab(s, flags, order)) + return NULL; + if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1345,14 +1353,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1362,7 +1371,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get @@ -1413,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1464,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - memcg_release_pages(s, order); page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); + __free_pages(page, order); + memcg_uncharge_slab(s, order); } #define need_reserve_slab_rcu \ @@ -1684,8 +1692,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1696,19 +1704,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -1720,7 +1726,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { void *object; - int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; + int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -1770,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n, #ifdef SLUB_DEBUG_CMPXCHG unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + pr_info("%s %s: cmpxchg redo ", n, s->name); #ifdef CONFIG_PREEMPT if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) - printk("due to cpu change %d -> %d\n", + pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); else #endif if (tid_to_event(tid) != tid_to_event(actual_tid)) - printk("due to cpu running other code. Event %ld->%ld\n", + pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); else - printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); @@ -1875,7 +1881,7 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial > s->min_partial) + if (!new.inuse && n->nr_partial >= s->min_partial) m = M_FREE; else if (new.freelist) { m = M_PARTIAL; @@ -1986,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s, new.freelist, new.counters, "unfreezing slab")); - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { page->next = discard_page; discard_page = page; } else { @@ -2121,11 +2127,19 @@ static inline int node_match(struct page *page, int node) return 1; } +#ifdef CONFIG_SLUB_DEBUG static int count_free(struct page *page) { return page->objects - page->inuse; } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct page *)) { @@ -2139,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } - -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int node; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; + + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->object_size, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -2178,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } +#endif } static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, @@ -2198,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, page = new_slab(s, flags, node); if (page) { - c = __this_cpu_ptr(s->cpu_slab); + c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); @@ -2323,8 +2334,6 @@ redo: if (freelist) goto load_freelist; - stat(s, ALLOC_SLOWPATH); - freelist = get_freelist(s, page); if (!freelist) { @@ -2360,9 +2369,7 @@ new_slab: freelist = new_slab_objects(s, gfpflags, node, &c); if (unlikely(!freelist)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - + slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; } @@ -2418,7 +2425,7 @@ redo: * and the retrieval of the tid. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); /* * The transaction ids are globally unique per cpu and per operation on @@ -2431,10 +2438,10 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - - else { + stat(s, ALLOC_SLOWPATH); + } else { void *next_object = get_freepointer_safe(s, object); /* @@ -2613,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* @@ -2674,7 +2681,7 @@ redo: * during the cmpxchg then the free will succedd. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); tid = c->tid; preempt_enable(); @@ -2894,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } n = page->freelist; @@ -3182,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3235,23 +3239,7 @@ static inline int kmem_cache_close(struct kmem_cache *s) int __kmem_cache_shutdown(struct kmem_cache *s) { - int rc = kmem_cache_close(s); - - if (!rc) { - /* - * We do the same lock strategy around sysfs_slab_add, see - * __kmem_cache_create. Because this is pretty much the last - * operation we do and the lock will be released shortly after - * that in slab_common.c, we could just move sysfs_slab_remove - * to a later point in common code. We should do that when we - * have a common sysfs framework for all allocators. - */ - mutex_unlock(&slab_mutex); - sysfs_slab_remove(s); - mutex_lock(&slab_mutex); - } - - return rc; + return kmem_cache_close(s); } /******************************************************************** @@ -3321,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3391,7 +3379,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); @@ -3408,7 +3396,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3464,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s) kfree(slabs_by_inuse); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); static int slab_mem_going_offline_callback(void *arg) { @@ -3472,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; @@ -3666,9 +3653,7 @@ void __init kmem_cache_init(void) register_cpu_notifier(&slab_notifier); #endif - printk(KERN_INFO - "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -3686,6 +3671,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if (!is_root_cache(s)) + return 1; + if (s->ctor) return 1; @@ -3698,9 +3686,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3723,7 +3710,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. * Courtesy of Adrian Drzewiecki @@ -3734,23 +3721,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, if (s->size - size >= sizeof(void *)) continue; - if (!cache_match_memcg(s, memcg)) - continue; - return s; } return NULL; } struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(memcg, size, align, flags, name, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { + int i; + struct kmem_cache *c; + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. @@ -3758,6 +3746,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; s = NULL; @@ -3780,10 +3777,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) return 0; memcg_propagate_slab_attrs(s); - mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - if (err) kmem_cache_close(s); @@ -3941,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -3952,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -4218,53 +4211,50 @@ static void resiliency_test(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); validate_slab_cache(kmalloc_caches[4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[6]); - printk(KERN_ERR "\nB. Corruption after free\n"); + pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[9]); } #else @@ -4339,7 +4329,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - lock_memory_hotplug(); + get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { @@ -4379,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - unlock_memory_hotplug(); + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } @@ -5058,15 +5048,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #ifdef CONFIG_MEMCG_KMEM int i; char *buffer = NULL; + struct kmem_cache *root_cache; - if (!is_root_cache(s)) + if (is_root_cache(s)) return; + root_cache = s->memcg_params->root_cache; + /* * This mean this cache had no attribute written. Therefore, no point * in copying default values around */ - if (!s->max_attr_size) + if (!root_cache->max_attr_size) return; for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { @@ -5088,7 +5081,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5097,7 +5090,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(s->memcg_params->root_cache, buf); + attr->show(root_cache, buf); attr->store(s, buf, strlen(buf)); } @@ -5106,6 +5099,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #endif } +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5113,6 +5111,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5130,6 +5129,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -5195,29 +5203,42 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; + s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) { - kobject_put(&s->kobj); - return err; - } + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; + if (err) + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5226,6 +5247,9 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -5276,7 +5300,7 @@ static int __init slab_sysfs_init(void) slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); - printk(KERN_ERR "Cannot register slab subsystem.\n"); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -5285,8 +5309,8 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -5295,8 +5319,8 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", al->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } diff --git a/mm/sparse.c b/mm/sparse.c index 38cad8f..d1b48b6 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,10 +5,12 @@ #include <linux/slab.h> #include <linux/mmzone.h> #include <linux/bootmem.h> +#include <linux/compiler.h> #include <linux/highmem.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> + #include "internal.h" #include <asm/dma.h> #include <asm/pgalloc.h> @@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) } #endif -void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +void __weak __meminit vmemmap_populate_print_last(void) { } @@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } static void __put_compound_page(struct page *page) @@ -79,95 +79,88 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -static void put_compound_page(struct page *page) +/** + * Two special cases here: we could avoid taking compound_lock_irqsave + * and could skip the tail refcounting(in _mapcount). + * + * 1. Hugetlbfs page: + * + * PageHeadHuge will remain true until the compound page + * is released and enters the buddy allocator, and it could + * not be split by __split_huge_page_refcount(). + * + * So if we see PageHeadHuge set, and we have the tail page pin, + * then we could safely put head page. + * + * 2. Slab THP page: + * + * PG_slab is cleared before the slab frees the head page, and + * tail pin cannot be the last reference left on the head page, + * because the slab code is free to reuse the compound page + * after a kfree/kmem_cache_free without having to check if + * there's any tail pin left. In turn all tail pinsmust be always + * released while the head is still pinned by the slab code + * and so we know PG_slab will be still set too. + * + * So if we see PageSlab set, and we have the tail page pin, + * then we could safely put head page. + */ +static __always_inline +void put_unrefcounted_compound_page(struct page *page_head, struct page *page) { - struct page *page_head; - - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* __split_huge_page_refcount can run under us */ - page_head = compound_head(page); - /* - * THP can not break up slab pages so avoid taking - * compound_lock() and skip the tail page refcounting (in - * _mapcount) too. Slab performs non-atomic bit ops on - * page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is still - * hot on arches that do not support - * this_cpu_cmpxchg_double(). - * - * If "page" is part of a slab or hugetlbfs page it cannot be - * splitted and the head page cannot change from under us. And - * if "page" is part of a THP page under splitting, if the - * head page pointed by the THP tail isn't a THP head anymore, - * we'll find PageTail clear after smp_rmb() and we'll treat - * it as a single page. + * If @page is a THP tail, we must read the tail page + * flags after the head page flags. The + * __split_huge_page_refcount side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. */ - if (!__compound_tail_refcounted(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { /* - * If "page" is a THP tail, we must read the tail page - * flags after the head page flags. The - * split_huge_page side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. + * __split_huge_page_refcount cannot race + * here, see the comment above this function. */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab - * compound page, the tail pin must - * not be the last reference held on - * the page, because the PG_slab - * cannot be cleared before all tail - * pins (which skips the _mapcount - * tail refcounting) have been - * released. For hugetlbfs the tail - * pin may be the last reference on - * the page instead, because - * PageHeadHuge will not go away until - * the compound page enters the buddy - * allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - return; - } else + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); + if (put_page_testzero(page_head)) { /* - * __split_huge_page_refcount run before us, - * "page" was a THP tail. The split page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). + * If this is the tail of a slab THP page, + * the tail pin must not be the last reference + * held on the page, because the PG_slab cannot + * be cleared before all tail pins (which skips + * the _mapcount tail refcounting) have been + * released. + * + * If this is the tail of a hugetlbfs page, + * the tail pin may be the last reference on + * the page instead, because PageHeadHuge will + * not go away until the compound page enters + * the buddy allocator. */ - goto out_put_single; - } + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); + __put_compound_page(page_head); + } + } else + /* + * __split_huge_page_refcount run before us, + * @page was a THP tail. The split @page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). + */ + if (put_page_testzero(page)) + __put_single_page(page); +} +static __always_inline +void put_refcounted_compound_page(struct page *page_head, struct page *page) +{ if (likely(page != page_head && get_page_unless_zero(page_head))) { unsigned long flags; /* - * page_head wasn't a dangling pointer but it may not + * @page_head wasn't a dangling pointer but it may not * be a head page anymore by the time we obtain the * lock. That is ok as long as it can't be freed from * under us. @@ -178,7 +171,7 @@ static void put_compound_page(struct page *page) compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { /* - * The head page may have been freed + * The @page_head may have been freed * and reallocated as a compound page * of smaller order and then freed * again. All we know is that it @@ -222,12 +215,51 @@ out_put_single: __put_single_page(page_head); } } else { - /* page_head is a dangling pointer */ + /* @page_head is a dangling pointer */ VM_BUG_ON_PAGE(PageTail(page), page); goto out_put_single; } } +static void put_compound_page(struct page *page) +{ + struct page *page_head; + + /* + * We see the PageCompound set and PageTail not set, so @page maybe: + * 1. hugetlbfs head page, or + * 2. THP head page. + */ + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { + /* + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. + */ + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); + } + return; + } + + /* + * We see the PageCompound set and PageTail set, so @page maybe: + * 1. a tail hugetlbfs page, or + * 2. a tail THP page, or + * 3. a split THP page. + * + * Case 3 is possible, as we may race with + * __split_huge_page_refcount tearing down a THP page. + */ + page_head = compound_head_by_tail(page); + if (!__compound_tail_refcounted(page_head)) + put_unrefcounted_compound_page(page_head, page); + else + put_refcounted_compound_page(page_head, page); +} + void put_page(struct page *page) { if (unlikely(PageCompound(page))) @@ -441,7 +473,7 @@ void rotate_reclaimable_page(struct page *page) page_cache_get(page); local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); local_irq_restore(flags); @@ -574,6 +606,8 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); + if (page_is_file_cache(page)) + workingset_activation(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); } @@ -581,12 +615,17 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); /* - * Queue the page for addition to the LRU via pagevec. The decision on whether - * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of __lru_cache_add() - * have the page added to the active list using mark_page_accessed(). + * Used to mark_page_accessed(page) that is not visible yet and when it is + * still safe to use non-atomic ops */ -void __lru_cache_add(struct page *page) +void init_page_accessed(struct page *page) +{ + if (!PageReferenced(page)) + __SetPageReferenced(page); +} +EXPORT_SYMBOL(init_page_accessed); + +static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -596,11 +635,34 @@ void __lru_cache_add(struct page *page) pagevec_add(pvec, page); put_cpu_var(lru_add_pvec); } -EXPORT_SYMBOL(__lru_cache_add); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add_anon(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} + +void lru_cache_add_file(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} +EXPORT_SYMBOL(lru_cache_add_file); /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. + * + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ void lru_cache_add(struct page *page) { @@ -811,7 +873,7 @@ void lru_add_drain_all(void) * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. */ -void release_pages(struct page **pages, int nr, int cold) +void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); @@ -852,7 +914,7 @@ void release_pages(struct page **pages, int nr, int cold) } /* Clear Active bit in case of parallel mark_page_accessed */ - ClearPageActive(page); + __ClearPageActive(page); list_add(&page->lru, &pages_to_free); } @@ -948,6 +1010,57 @@ void __pagevec_lru_add(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_lru_add); /** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. + */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace3..2972eee 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo, 0); + release_pages(pagep, todo, false); pagep += todo; nr -= todo; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6..4c524f7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; -static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. + */ +PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by get_swap_page() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but get_swap_page() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. + */ +static PLIST_HEAD(swap_avail_head); +static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * But if seek is cheap, search from our current position, so - * that swap is allocated from all over the partition: if the - * Flash Translation Layer only remaps within limited zones, - * we don't want to wear out the first zone too quickly. + * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info + * case, just handled by scan_swap_map_try_ssd_cluster() above. */ - if (!(si->flags & SWP_SOLIDSTATE)) - scan_base = offset = si->lowest_bit; + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } } - offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster < scan_base; offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; @@ -591,6 +586,9 @@ checks: if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); @@ -640,71 +638,65 @@ no_page: swp_entry_t get_swap_page(void) { - struct swap_info_struct *si; + struct swap_info_struct *si, *next; pgoff_t offset; - int type, next; - int wrapped = 0; - int hp_index; - spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - hp_index = atomic_xchg(&highest_priority_index, -1); - /* - * highest_priority_index records current highest priority swap - * type which just frees swap entries. If its priority is - * higher than that of swap_list.next swap type, we use it. It - * isn't protected by swap_lock, so it can be an invalid value - * if the corresponding swap type is swapoff. We double check - * the flags here. It's even possible the swap type is swapoff - * and swapon again and its priority is changed. In such rare - * case, low prority swap type might be used, but eventually - * high priority swap will be used after several rounds of - * swap. - */ - if (hp_index != -1 && hp_index != type && - swap_info[type]->prio < swap_info[hp_index]->prio && - (swap_info[hp_index]->flags & SWP_WRITEOK)) { - type = hp_index; - swap_list.next = type; - } - - si = swap_info[type]; - next = si->next; - if (next < 0 || - (!wrapped && si->prio != swap_info[next]->prio)) { - next = swap_list.head; - wrapped++; - } + spin_lock(&swap_avail_lock); +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&si->lock); - if (!si->highest_bit) { - spin_unlock(&si->lock); - continue; - } - if (!(si->flags & SWP_WRITEOK)) { + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&si->lock); - continue; + goto nextsi; } - swap_list.next = next; - - spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if (offset) - return swp_entry(type, offset); - spin_lock(&swap_lock); - next = swap_list.next; + return swp_entry(si->type, offset); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. + */ + if (plist_node_empty(&next->avail_list)) + goto start_over; } + spin_unlock(&swap_avail_lock); + atomic_long_inc(&nr_swap_pages); noswap: - spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -766,27 +758,6 @@ out: return NULL; } -/* - * This swap type frees swap entry, check if it is the highest priority swap - * type which just frees swap entry. get_swap_page() uses - * highest_priority_index to search highest priority swap type. The - * swap_info_struct.lock can't protect us if there are multiple swap types - * active, so we use atomic_cmpxchg. - */ -static void set_highest_priority_index(int type) -{ - int old_hp_index, new_hp_index; - - do { - old_hp_index = atomic_read(&highest_priority_index); - if (old_hp_index != -1 && - swap_info[old_hp_index]->prio >= swap_info[type]->prio) - break; - new_hp_index = type; - } while (atomic_cmpxchg(&highest_priority_index, - old_hp_index, new_hp_index) != old_hp_index); -} - static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; - if (offset > p->highest_bit) + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; p->highest_bit = offset; - set_highest_priority_index(p->type); + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -1765,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - int i, prev; - if (prio >= 0) p->prio = prio; else p->prio = --least_priority; + /* + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p->type; - else - swap_info[prev]->next = p->type; + assert_spin_locked(&swap_lock); + /* + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. + * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. + */ + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1823,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int i, type, prev; - int err; + int err, found = 0; unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) @@ -1842,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out; mapping = victim->f_mapping; - prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { - p = swap_info[type]; + plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { - if (p->swap_file->f_mapping == mapping) + if (p->swap_file->f_mapping == mapping) { + found = 1; break; + } } - prev = type; } - if (type < 0) { + if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; @@ -1864,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) - swap_list.head = p->next; - else - swap_info[prev]->next = p->next; - if (type == swap_list.next) { - /* just pick something that's safe... */ - swap_list.next = swap_list.head; - } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&p->lock); if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i]->next) - swap_info[i]->prio = p->prio--; + struct swap_info_struct *si = p; + + plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; + si->list.prio--; + si->avail_list.prio--; + } least_priority++; } + plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1885,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); set_current_oom_origin(); - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); if (err) { @@ -1926,7 +1912,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); - frontswap_invalidate_area(type); + frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); @@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account information */ - swap_cgroup_swapoff(type); + swap_cgroup_swapoff(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2142,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; - p->next = -1; spin_unlock(&swap_lock); spin_lock_init(&p->lock); diff --git a/mm/truncate.c b/mm/truncate.c index 353b683..eda2473 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,6 +22,45 @@ #include <linux/cleancache.h> #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page @@ -208,11 +247,12 @@ void truncate_inode_pages_range(struct address_space *mapping, unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; /* Offsets within partial pages */ @@ -238,17 +278,23 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while (index < end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -259,6 +305,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -307,14 +354,18 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* If all gone from start onwards, we're done */ if (index == start) break; + /* Otherwise restart to make sure all gone */ index = start; continue; } - if (index == start && pvec.pages[0]->index >= end) { + if (index == start && indices[0] >= end) { + /* All gone out of hole to be punched, we're done */ + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -323,9 +374,17 @@ void truncate_inode_pages_range(struct address_space *mapping, struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; - if (index >= end) + index = indices[i]; + if (index >= end) { + /* Restart punch to make sure all gone */ + index = start - 1; break; + } + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } lock_page(page); WARN_ON(page->index != index); @@ -333,6 +392,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -360,6 +420,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) EXPORT_SYMBOL(truncate_inode_pages); /** + * truncate_inode_pages_final - truncate *all* pages before inode dies + * @mapping: mapping to truncate + * + * Called under (and serialized by) inode->i_mutex. + * + * Filesystems have to use this in the .evict_inode path to inform the + * VM that this is the final truncate and the inode is going away. + */ +void truncate_inode_pages_final(struct address_space *mapping) +{ + unsigned long nrshadows; + unsigned long nrpages; + + /* + * Page reclaim can not participate in regular inode lifetime + * management (can't call iput()) and thus can race with the + * inode teardown. Tell it when the address space is exiting, + * so that it does not install eviction information after the + * final truncate has begun. + */ + mapping_set_exiting(mapping); + + /* + * When reclaim installs eviction entries, it increases + * nrshadows first, then decreases nrpages. Make sure we see + * this in the right order or we might miss an entry. + */ + nrpages = mapping->nrpages; + smp_rmb(); + nrshadows = mapping->nrshadows; + + if (nrpages || nrshadows) { + /* + * As truncation uses a lockless tree lookup, cycle + * the tree lock to make sure any ongoing tree + * modification that does not see AS_EXITING is + * completed before starting the final truncate. + */ + spin_lock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + + truncate_inode_pages(mapping, 0); + } +} +EXPORT_SYMBOL(truncate_inode_pages_final); + +/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate @@ -375,32 +482,31 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; - /* - * Note: this function may get called on a shmem/tmpfs mapping: - * pagevec_lookup() might then return 0 prematurely (because it - * got a gangful of swap entries); but it's hardly worth worrying - * about - it can rarely have anything to free from such a mapping - * (most pages are dirty), and already skips over any difficulties. - */ - pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -414,6 +520,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -444,7 +551,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -481,6 +588,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -491,17 +599,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -539,6 +653,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> @@ -9,6 +10,7 @@ #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> @@ -307,7 +309,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * If the architecture not support this function, simply return with no * page pinned */ -int __attribute__((weak)) __get_user_pages_fast(unsigned long start, +int __weak __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { return 0; @@ -338,7 +340,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. */ -int __attribute__((weak)) get_user_pages_fast(unsigned long start, +int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; @@ -386,6 +388,15 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + struct address_space *page_mapping(struct page *page) { struct address_space *mapping = page->mapping; @@ -445,6 +456,54 @@ unsigned long vm_commit_limit(void) return allowed; } +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * Returns the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NULL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > buflen) + len = buflen; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, mm->env_start, + buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 0000000..9f25af8 --- /dev/null +++ b/mm/vmacache.c @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmacache.h> + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + /* + * Single threaded tasks need not iterate the entire + * list of process. We can avoid the flushing as well + * since the mm's seqnum was increased and don't have + * to worry about other threads' seqnum. Current's + * flush will occur upon the next lookup. + */ + if (atomic_read(&mm->mm_users) == 1) + return; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. + */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (!vma) + continue; + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } + } + + return NULL; +} +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf968..f64632b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -27,7 +27,9 @@ #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> +#include <linux/compiler.h> #include <linux/llist.h> + #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); * @node: prefer to allocate data structures on this node * @prot: memory protection to use. PAGE_KERNEL for regular RAM * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. + * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) @@ -1260,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } +EXPORT_SYMBOL_GPL(unmap_kernel_range); int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { @@ -1488,7 +1497,7 @@ void vfree(const void *addr) if (!addr) return; if (unlikely(in_interrupt())) { - struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } else @@ -2181,7 +2190,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } @@ -2611,19 +2620,19 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) - seq_printf(m, " ioremap"); + seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) - seq_printf(m, " vmalloc"); + seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) - seq_printf(m, " vmap"); + seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) - seq_printf(m, " user"); + seq_puts(m, " user"); if (v->flags & VM_VPAGES) - seq_printf(m, " vpages"); + seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); diff --git a/mm/vmscan.c b/mm/vmscan.c index a9c74b4..0f16ffe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/module.h> #include <linux/gfp.h> @@ -43,6 +45,7 @@ #include <linux/sysctl.h> #include <linux/oom.h> #include <linux/prefetch.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -83,6 +86,9 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; + /* anon vs. file LRUs scanning "ratio" */ + int swappiness; + /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. @@ -224,15 +230,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, unsigned long freed = 0; unsigned long long delta; long total_scan; - long max_pass; + long freeable; long nr; long new_nr; int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = shrinker->count_objects(shrinker, shrinkctl); - if (max_pass == 0) + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) return 0; /* @@ -244,14 +250,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; + delta *= freeable; do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); - total_scan = max_pass; + total_scan = freeable; } /* @@ -260,26 +266,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * shrinkers to return -1 all the time. This results in a large * nr being built up so when a shrink that can do some work * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in + * freeable. This is bad for sustaining a working set in * memory. * * Hence only allow the shrinker to scan the entire cache when * a large delta change is calculated directly. */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + if (total_scan > freeable * 2) + total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -292,12 +298,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (max_pass), we must be + * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || - total_scan >= max_pass) { + total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); @@ -324,7 +330,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; } @@ -458,7 +464,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in __generic_file_aio_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * @@ -477,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -523,7 +529,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -569,10 +576,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swapcache_free(swap, page); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __delete_from_page_cache(page); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -595,7 +615,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1065,7 +1085,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1107,7 +1127,7 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - free_hot_cold_page_list(&free_pages, 1); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1144,7 +1164,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } @@ -1425,6 +1445,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) } /* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1505,7 +1538,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1540,19 +1573,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. It will forcibly stall in the - * next check. + * pages from reclaim context. */ if (nr_unqueued_dirty == nr_taken) zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); /* - * In addition, if kswapd scans pages marked marked for - * immediate reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU faster than + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_unqueued_dirty == nr_taken || nr_immediate) + if (nr_immediate && current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1561,7 +1593,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. */ - if (!sc->hibernation_mode && !current_is_kswapd()) + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1726,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP @@ -1816,13 +1849,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -1848,10 +1874,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file, free; + unsigned long anon, file; bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1881,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (!global_reclaim(sc) && !sc->swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1891,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && sc->swappiness) { scan_balance = SCAN_EQUAL; goto out; } @@ -1902,13 +1930,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, get_lru_size(lruvec, LRU_INACTIVE_FILE); /* - * If it's foreseeable that reclaiming the file cache won't be - * enough to get the zone back into a desirable shape, we have - * to swap. Better start now and leave the - probably heavily - * thrashing - remaining file pages alone. + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); + unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + if (unlikely(file + free <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; @@ -1930,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); + anon_prio = sc->swappiness; file_prio = 200 - anon_prio; /* @@ -1971,39 +2003,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } @@ -2019,13 +2061,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2046,17 +2102,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) continue; /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2064,6 +2111,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; @@ -2206,6 +2262,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) lruvec = mem_cgroup_zone_lruvec(zone, memcg); + sc->swappiness = mem_cgroup_swappiness(memcg); shrink_lruvec(lruvec, sc); /* @@ -2250,9 +2307,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); @@ -2297,16 +2353,26 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long lru_pages = 0; bool aborted_reclaim = false; + struct reclaim_state *reclaim_state = current->reclaim_state; + gfp_t orig_mask; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ + orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; + nodes_clear(shrink.nodes_to_scan); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2318,6 +2384,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; + + lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ @@ -2331,7 +2401,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * noticeable problem, like transparent huge * page allocations. */ - if (compaction_ready(zone, sc)) { + if ((zonelist_zone_idx(z) <= requested_highidx) + && compaction_ready(zone, sc)) { aborted_reclaim = true; continue; } @@ -2354,6 +2425,26 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) shrink_zone(zone, sc); } + /* + * Don't shrink slabs when reclaiming memory from over limit cgroups + * but do shrink slab at least once when aborting reclaim for + * compaction to avoid unevenly scanning file/anon LRU pages over slab + * pages. + */ + if (global_reclaim(sc)) { + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + return aborted_reclaim; } @@ -2394,13 +2485,9 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc, - struct shrink_control *shrink) + struct scan_control *sc) { unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct zoneref *z; - struct zone *zone; unsigned long writeback_threshold; bool aborted_reclaim; @@ -2415,32 +2502,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; aborted_reclaim = shrink_zones(zonelist, sc); - /* - * Don't shrink slabs when reclaiming memory from over limit - * cgroups but do shrink slab at least once when aborting - * reclaim for compaction to avoid unevenly scanning file/anon - * LRU pages over slab pages. - */ - if (global_reclaim(sc)) { - unsigned long lru_pages = 0; - - nodes_clear(shrink->nodes_to_scan); - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), - shrink->nodes_to_scan); - } - - shrink_slab(shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; @@ -2502,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ @@ -2530,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2551,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ @@ -2602,9 +2694,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .target_mem_cgroup = NULL, .nodemask = nodemask, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; /* * Do not enter reclaim if fatal signal was delivered while throttled. @@ -2618,7 +2707,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2640,6 +2729,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_swap = !noswap, .order = 0, .priority = 0, + .swappiness = mem_cgroup_swappiness(memcg), .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); @@ -2685,9 +2775,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't @@ -2702,7 +2789,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2874,9 +2961,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * high wmark plus a "gap" where the gap is either the low * watermark or 1% of the zone, whichever is smaller. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); /* * If there is no low memory pressure or the zone is balanced then no @@ -3285,7 +3371,10 @@ static int kswapd(void *p) } } + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } @@ -3337,9 +3426,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .order = 0, .priority = DEF_PRIORITY, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; @@ -3349,7 +3435,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -3408,7 +3494,7 @@ int kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end(). */ void kswapd_stop(int nid) { diff --git a/mm/vmstat.c b/mm/vmstat.c index def5dd2..b37bd49 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, } /* - * For use when we know that interrupts are disabled. + * For use when we know that interrupts are disabled, + * or when we know that preemption is disabled and that + * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) @@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void) continue; if (__this_cpu_read(p->pcp.count)) - drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); #endif } fold_diff(global_diff); @@ -770,6 +772,9 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "workingset_refault", + "workingset_activate", + "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", "nr_dirty_threshold", @@ -810,6 +815,9 @@ const char * const vmstat_text[] = { "pgrotated", + "drop_pagecache", + "drop_slab", + #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", "numa_huge_pte_updates", @@ -860,6 +868,10 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ +#ifdef CONFIG_DEBUG_VM_VMACACHE + "vmacache_find_calls", + "vmacache_find_hits", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ @@ -1220,7 +1232,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { refresh_cpu_vm_stats(); - schedule_delayed_work(&__get_cpu_var(vmstat_work), + schedule_delayed_work(this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } @@ -1292,14 +1304,14 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP int cpu; - register_cpu_notifier(&vmstat_notifier); + cpu_notifier_register_begin(); + __register_cpu_notifier(&vmstat_notifier); - get_online_cpus(); for_each_online_cpu(cpu) { start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); } - put_online_cpus(); + cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); diff --git a/mm/workingset.c b/mm/workingset.c new file mode 100644 index 0000000..f7216fa --- /dev/null +++ b/mm/workingset.c @@ -0,0 +1,414 @@ +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include <linux/memcontrol.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/mm.h> + +/* + * Double CLOCK lists + * + * Per zone, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. + * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicate the minimum number of inactive pages accessed in + * between. + * + * 2. Moving one inactive page N page slots towards the tail of the + * list requires at least N inactive page accesses. + * + * Combining these: + * + * 1. When a page is finally evicted from memory, the number of + * inactive pages accessed while the page was in cache is at least + * the number of page slots on the inactive list. + * + * 2. In addition, measuring the sum of evictions and activations (E) + * at the time of a page's eviction, and comparing it to another + * reading (R) at the time the page faults back into memory tells + * the minimum number of accesses while the page was not cached. + * This is called the refault distance. + * + * Because the first access of the page was the fault and the second + * access the refault, we combine the in-cache distance with the + * out-of-cache distance to get the complete minimum access distance + * of this page: + * + * NR_inactive + (R - E) + * + * And knowing the minimum access distance of a page, we can easily + * tell if the page would be able to stay in cache assuming all page + * slots in the cache were available: + * + * NR_inactive + (R - E) <= NR_inactive + NR_active + * + * which can be further simplified to + * + * (R - E) <= NR_active + * + * Put into words, the refault distance (out-of-cache) can be seen as + * a deficit in inactive list space (in-cache). If the inactive list + * had (R - E) more page slots, the page would not have been evicted + * in between accesses, but activated instead. And on a full system, + * the only thing eating into inactive list space is active pages. + * + * + * Activating refaulting pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given + * time there is actually a good chance that pages on the active list + * are no longer in active use. + * + * So when a refault distance of (R - E) is observed and there are at + * least (R - E) active pages, the refaulting page is activated + * optimistically in the hope that (R - E) active pages are actually + * used less frequently than the refaulting page - or even not used at + * all anymore. + * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used once will be evicted from memory. + * + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. + * + * + * Implementation + * + * For each zone's file LRU lists, a counter for inactive evictions + * and activations is maintained (zone->inactive_age). + * + * On eviction, a snapshot of this counter (along with some bits to + * identify the zone) is stored in the now empty page cache radix tree + * slot of the evicted page. This is called a shadow entry. + * + * On cache misses for which there are shadow entries, an eligible + * refault distance will immediately activate the refaulting page. + */ + +static void *pack_shadow(unsigned long eviction, struct zone *zone) +{ + eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); + eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); +} + +static void unpack_shadow(void *shadow, + struct zone **zone, + unsigned long *distance) +{ + unsigned long entry = (unsigned long)shadow; + unsigned long eviction; + unsigned long refault; + unsigned long mask; + int zid, nid; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + zid = entry & ((1UL << ZONES_SHIFT) - 1); + entry >>= ZONES_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + eviction = entry; + + *zone = NODE_DATA(nid)->node_zones + zid; + + refault = atomic_long_read(&(*zone)->inactive_age); + mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + + RADIX_TREE_EXCEPTIONAL_SHIFT); + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + *distance = (refault - eviction) & mask; +} + +/** + * workingset_eviction - note the eviction of a page from memory + * @mapping: address space the page was backing + * @page: the page being evicted + * + * Returns a shadow entry to be stored in @mapping->page_tree in place + * of the evicted @page so that a later refault can be detected. + */ +void *workingset_eviction(struct address_space *mapping, struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long eviction; + + eviction = atomic_long_inc_return(&zone->inactive_age); + return pack_shadow(eviction, zone); +} + +/** + * workingset_refault - evaluate the refault of a previously evicted page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the zone it was allocated in. + * + * Returns %true if the page should be activated, %false otherwise. + */ +bool workingset_refault(void *shadow) +{ + unsigned long refault_distance; + struct zone *zone; + + unpack_shadow(shadow, &zone, &refault_distance); + inc_zone_state(zone, WORKINGSET_REFAULT); + + if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + inc_zone_state(zone, WORKINGSET_ACTIVATE); + return true; + } + return false; +} + +/** + * workingset_activation - note a page activation + * @page: page that is being activated + */ +void workingset_activation(struct page *page) +{ + atomic_long_inc(&page_zone(page)->inactive_age); +} + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru workingset_shadow_nodes; + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + local_irq_enable(); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. + * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchroneously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_del_init(item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. + */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_irq_enable(); + cond_resched(); + local_irq_disable(); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + shadow_lru_isolate, NULL, &sc->nr_to_scan); + local_irq_enable(); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. + */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init); @@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, enum buddy bud; struct page *page; - if (size <= 0 || gfp & __GFP_HIGHMEM) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c03ca5e..fe78189 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -141,7 +141,7 @@ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes! There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes @@ -814,21 +814,32 @@ static void zs_exit(void) { int cpu; + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); + __unregister_cpu_notifier(&zs_cpu_nb); + + cpu_notifier_register_done(); } static int zs_init(void) { int cpu, ret; - register_cpu_notifier(&zs_cpu_nb); + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) + if (notifier_to_errno(ret)) { + cpu_notifier_register_done(); goto fail; + } } + + cpu_notifier_register_done(); + return 0; fail: zs_exit(); @@ -1071,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) class = &pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &__get_cpu_var(zs_map_area); + area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { @@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* zbud_pool is shared by all of zswap backend */ +static struct zbud_pool *zswap_pool; + /********************************* * compression functions **********************************/ @@ -160,14 +163,14 @@ static void zswap_comp_exit(void) * rbnode - links the entry into red-black tree for the appropriate swap type * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code - * concurent calls to load, invalidate, and writeback. The lock + * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zsmalloc allocation handle that stores the compressed page data + * handle - zbud allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression */ struct zswap_entry { struct rb_node rbnode; @@ -189,7 +192,6 @@ struct zswap_header { struct zswap_tree { struct rb_root rbroot; spinlock_t lock; - struct zbud_pool *pool; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; @@ -202,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache; static int zswap_entry_cache_create(void) { zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); - return (zswap_entry_cache == NULL); + return zswap_entry_cache == NULL; } static void zswap_entry_cache_destory(void) @@ -282,16 +284,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * Carries out the common pattern of freeing and entry's zbud allocation, * freeing the entry itself, and decrementing the number of stored pages. */ -static void zswap_free_entry(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(tree->pool, entry->handle); + zbud_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_pages = zbud_get_pool_size(zswap_pool); } /* caller must hold the tree lock */ @@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree, BUG_ON(refcount < 0); if (refcount == 0) { zswap_rb_erase(&tree->rbroot, entry); - zswap_free_entry(tree, entry); + zswap_free_entry(entry); } } @@ -346,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_BAD; } *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; - dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); crypto_free_comp(tfm); @@ -387,18 +388,18 @@ static int zswap_cpu_init(void) { unsigned long cpu; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) goto cleanup; - register_cpu_notifier(&zswap_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&zswap_cpu_notifier_block); + cpu_notifier_register_done(); return 0; cleanup: for_each_online_cpu(cpu) __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); - put_online_cpus(); + cpu_notifier_register_done(); return -ENOMEM; } @@ -407,8 +408,8 @@ cleanup: **********************************/ static bool zswap_is_full(void) { - return (totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages); + return totalram_pages * zswap_max_pool_percent / 100 < + zswap_pool_pages; } /********************************* @@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) zbud_unmap(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); - BUG_ON(pool != tree->pool); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + + src = (u8 *)zbud_map(zswap_pool, entry->handle) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zbud_unmap(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(tree->pool, 8)) { + if (zbud_reclaim_page(zswap_pool, 8)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(tree->pool, handle); + zhdr = zbud_map(zswap_pool, handle); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(tree->pool, handle); + zbud_unmap(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_pages = zbud_get_pool_size(zswap_pool); return 0; @@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + + src = (u8 *)zbud_map(zswap_pool, entry->handle) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zbud_unmap(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type) /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(tree, entry); + zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); - - zbud_destroy_pool(tree->pool); kfree(tree); zswap_trees[type] = NULL; } @@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type) struct zswap_tree *tree; tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); - if (!tree) - goto err; - tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); - if (!tree->pool) - goto freetree; + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; - return; - -freetree: - kfree(tree); -err: - pr_err("alloc failed, zswap disabled for swap type %d\n", type); } static struct frontswap_ops zswap_frontswap_ops = { @@ -907,9 +899,16 @@ static int __init init_zswap(void) return 0; pr_info("loading zswap\n"); + + zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + if (!zswap_pool) { + pr_err("zbud pool creation failed\n"); + goto error; + } + if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); - goto error; + goto cachefail; } if (zswap_comp_init()) { pr_err("compressor initialization failed\n"); @@ -919,6 +918,7 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto pcpufail; } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); @@ -927,6 +927,8 @@ pcpufail: zswap_comp_exit(); compfail: zswap_entry_cache_destory(); +cachefail: + zbud_destroy_pool(zswap_pool); error: return -ENOMEM; } |